Skip to content

Commit b15065b

Browse files
[mxfp8 moe training] fix torch ref impl of SF blocked layout per group along K
1 parent 3955b6c commit b15065b

File tree

2 files changed

+36
-12
lines changed

2 files changed

+36
-12
lines changed

test/prototype/moe_training/test_kernels.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -313,9 +313,6 @@ def test_mxfp8_per_group_blocked_scales_3d(
313313
)
314314

315315

316-
@pytest.mark.skip(
317-
"Temporarily disable and use e2e training numerical tests instead. See: https://github.com/pytorch/ao/pull/2990#discussion_r2354167396"
318-
)
319316
@skip_if_rocm("ROCm enablement in progress")
320317
@pytest.mark.parametrize("m", [256, 512, 1024, 5120])
321318
@pytest.mark.parametrize("total_k", [512, 1024, 2048, 4096, 8192, 16384])

torchao/prototype/moe_training/kernels/mxfp8/quant.py

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,15 @@ def torch_to_blocked_2d_K_groups(
100100
num_groups = group_offs.shape[0]
101101

102102
# Each group will require a variable amount of padding, so to avoid d2h sync caused by iterating over each group,
103-
# Triton kernel will use an upper bound of adding 4 padding cols to each group.
104-
# (This torch impl is used as a reference for correctness, so we must match the triton kernel's impl).
105103
total_K_padded = total_K + num_groups * 4
106104
blocked_scales = x_scales.new_zeros(padded_M, total_K_padded)
107105

106+
# Flattened view for easier indexing when writing to subregions of memory
107+
blocked_scales_flat = blocked_scales.view(-1)
108+
109+
BLOCK_ROWS, BLOCK_COLS = 128, 4
110+
output_stride_per_block = BLOCK_ROWS * BLOCK_COLS # 512
111+
108112
start_col_after_padding_list = [0]
109113
group_start_idx = 0
110114
for i, group_end_idx in enumerate(group_offs.tolist()):
@@ -119,14 +123,37 @@ def torch_to_blocked_2d_K_groups(
119123
group_scales_blocked = to_blocked(group_scales)
120124
cols_after_padding = ceil_div(group_size, 4) * 4
121125

122-
# Write output to subtensor
123-
blocked_scales[
124-
:,
125-
prev_start_col_after_padding : prev_start_col_after_padding
126-
+ cols_after_padding,
127-
] = group_scales_blocked.reshape(-1, cols_after_padding)
126+
num_row_blocks = ceil_div(M, 128)
127+
num_col_blocks = cols_after_padding // 4
128128

129-
# Calculate the start row after padding
129+
# Reshape blocked scales from flattened format to (num_row_blocks, num_col_blocks, ...)
130+
# so we can write each SF tile to its output buffer individually.
131+
group_scales_reshaped = group_scales_blocked.view(
132+
num_row_blocks, num_col_blocks, -1
133+
)
134+
out_group_base_offset = prev_start_col_after_padding * padded_M
135+
136+
# For each SF tile, write to the output tensor
137+
for row_block in range(num_row_blocks):
138+
for col_block in range(num_col_blocks):
139+
block_data = group_scales_reshaped[row_block, col_block]
140+
141+
stride_per_row_of_blocks_in_group = (
142+
num_col_blocks * output_stride_per_block
143+
)
144+
offset_in_group = (
145+
row_block * stride_per_row_of_blocks_in_group
146+
+ col_block * output_stride_per_block
147+
)
148+
final_offset = out_group_base_offset + offset_in_group
149+
150+
# flattened (512,) for (128,4) sf tile
151+
block_flat = block_data.reshape(-1)
152+
blocked_scales_flat[
153+
final_offset : final_offset + output_stride_per_block
154+
] = block_flat
155+
156+
# Calculate the start col after padding
130157
new_start_col = prev_start_col_after_padding + cols_after_padding
131158
start_col_after_padding_list.append(new_start_col)
132159

0 commit comments

Comments
 (0)