[LoadStoreOpToLLVM] Transposed 2d load.

chengjunlu · chengjunlu · commit 942ca3751c6a · 2025-11-04T12:38:44.000Z
Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
diff --git a/python/test/unit/intel/test_block_io.py b/python/test/unit/intel/test_block_io.py
@@ -120,8 +120,9 @@ def warps_per_cta(layout):
 @pytest.mark.parametrize("layout", layouts)
 @pytest.mark.parametrize("load_block_ptr, store_block_ptr", [(True, True), (False, False), (True, False),
                                                              (False, True)])
+@pytest.mark.parametrize("transpose", [True, False])
 @pytest.mark.skipif(not is_xpu(), reason="Block store tests are specific to the XPU backend")
-def test_block_io(M, N, dtype_str, layout, load_block_ptr, store_block_ptr, device, tmp_path: pathlib.Path):
+def test_block_io(M, N, dtype_str, layout, load_block_ptr, store_block_ptr, transpose, device, tmp_path: pathlib.Path):
 
     warps = warps_per_cta(layout)
     num_warps = int(np.prod(warps))
@@ -132,16 +133,18 @@ def test_block_io(M, N, dtype_str, layout, load_block_ptr, store_block_ptr, devi
 
     support_block_io = torch.xpu.get_device_capability()['has_subgroup_2d_block_io']
 
+    block_io = "\"column_major\"" if transpose else "\"row_major\""
+
     if load_block_ptr:
         load_ops = f"""
-            %src_ptr = tt.make_tensor_ptr %src, [%M_i64, %N_i64], [%N_i64, %c1_i64], [%c0_i32, %c0_i32] {{order = array<i32: 1, 0>}} : <tensor<{M}x{N}x{ty}, #layout>>
-            %store_val = tt.load %src_ptr {{ttig.block_io = "row_major", boundaryCheck = array<i32: 0, 1>, padding = 1 : i32}} : !tt.ptr<tensor<{M}x{N}x{ty}, #layout>>
+            %src_ptr = tt.make_tensor_ptr %src, [%M_i64, %N_i64], {"[%c1_i64, %M_i64]" if transpose else "[%N_i64, %c1_i64]"}, [%c0_i32, %c0_i32] {{order = array<i32: 1, 0>}} : <tensor<{M}x{N}x{ty}, #layout>>
+            %store_val = tt.load %src_ptr {{ttig.block_io = {block_io}, boundaryCheck = array<i32: 0, 1>, padding = 1 : i32}} : !tt.ptr<tensor<{M}x{N}x{ty}, #layout>>
             """
     else:
         load_ops = f"""
             %src_base = tt.splat %src : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>
-            %src_ptr = tt.addptr %src_base, %row_major_off : tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>, tensor<{M}x{N}xi32, #layout>
-            %store_val = tt.load %src_ptr {{ttig.block_io = "row_major"}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>
+            %src_ptr = tt.addptr %src_base, {"%col_major_off" if transpose else "%row_major_off" } : tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>, tensor<{M}x{N}xi32, #layout>
+            %store_val = tt.load %src_ptr {{ttig.block_io = {block_io}}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>
             """
     if store_block_ptr:
         store_ops = f"""
@@ -175,6 +178,12 @@ def test_block_io(M, N, dtype_str, layout, load_block_ptr, store_block_ptr, devi
             %7 = tt.broadcast %5 : tensor<1x{N}xi32, #layout> -> tensor<{M}x{N}xi32, #layout>
             %row_major_off = arith.addi %6, %7 : tensor<{M}x{N}xi32, #layout>
 
+            %stride_M = arith.constant dense<{M}> : tensor<1x{N}xi32, #layout>
+            %col_stride = arith.muli %5, %stride_M : tensor<1x{N}xi32, #layout>
+            %8 = tt.broadcast %2 : tensor<{M}x1xi32, #layout> -> tensor<{M}x{N}xi32, #layout>
+            %9 = tt.broadcast %col_stride : tensor<1x{N}xi32, #layout> -> tensor<{M}x{N}xi32, #layout>
+            %col_major_off = arith.addi %8, %9 : tensor<{M}x{N}xi32, #layout>
+
             {load_ops}
             {store_ops}
 
@@ -195,6 +204,8 @@ def test_block_io(M, N, dtype_str, layout, load_block_ptr, store_block_ptr, devi
     temp_file.write_text(ir)
     kernel = triton.compile(str(temp_file))
 
+    a = a.permute(1, 0).contiguous().permute(1, 0) if transpose else a
+
     kernel[(1, 1, 1)](a, x)
     assert torch.equal(a, x)
 
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -2521,18 +2521,6 @@ struct LoadOpToBlockIOConversion
     if (tileHeight * tileWidth * packedElemSizeInBits / 8 < GRF_SIZE)
       vBlocks = 1;
 
-    // TODO: use the axis info to general the handling for both regular pointer
-    // and block pointer.
-    const bool memoryRowMajor = isMemoryRowMajor(op);
-    // FIXME: Add support of column major.
-    if (!memoryRowMajor)
-      return failure();
-
-    unsigned contiguousDim = memoryRowMajor ? 1 : 0;
-    const bool isTransposeRequired = contiguousDim != colDim;
-    if (isTransposeRequired)
-      return matchAndRewriteTranspose(op, adaptor, rewriter);
-
     Location loc = op.getLoc();
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     MLIRContext *ctx = rewriter.getContext();
@@ -2661,6 +2649,55 @@ struct LoadOpToBlockIOConversion
       }
     }
 
+    // TODO: use the axis info to generalize the handling for both regular
+    // pointers and block pointers.
+    const bool memoryRowMajor = isMemoryRowMajor(op);
+    unsigned contiguousDim = memoryRowMajor ? 1 : 0;
+    const bool isTransposeRequired = contiguousDim != colDim;
+
+    if (isTransposeRequired) {
+      if (numPackedVals > 1)
+        return failure();
+      if (elemSizeInBits > 32)
+        return failure();
+      if (tileWidth > 32)
+        return failure(); // tileWidth is limited to 32 for transpose 2d load.
+
+      vBlocks = 1;
+
+      // Use 32-bit (d32) packed elements for the transposed 2D load.
+      packedElemSizeInBits = 32;
+      numPackedVals = packedElemSizeInBits / elemSizeInBits;
+      if (numPackedVals > 1 && tileWidth != threadsPerWarp)
+        return failure(); // Couldn't use the transpose 2d load for un-packable
+                          // along tile height dim.
+      tileHeight = std::min(tileHeight / numPackedVals, 8);
+
+      if (tileHeight * tileWidth < threadsPerWarp)
+        return failure(); // The tile size is not large enough for IGC scalar
+                          // backend vectorization.
+      // transpose the width and height of the tile
+      std::swap(tileHeight, tileWidth);
+      // if (oneMatrixPerLoadForBT) {
+      //   // Only load 1 operand per inst on row.
+      //   numOperandsPer2DLoadM = 1;
+      //   tileHeight = elemsPerDPASInst[threadOrder[rank - 2]];
+      // } else {
+      //   // We can decompose the matrix returned by transposed large 2d load
+      //   // when threads per warp < column size. Otherwise we have to load one
+      //   // operand per inst.
+      //   // Note: the tileHeight and numOperandsPer2DLoadM are the column size
+      //   // now.
+      //   numOperandsPer2DLoadM =
+      //       (threadsPerWarp <= tileHeight) ? repCluster[rank - 1] : 1;
+      // }
+      // // The transpose 2d load only support 1 operand per inst on column.
+      // // (vBlocks = 1)
+      // numOperandsPer2DloadN = 1;
+      // // TODO: support load column major data.
+      // return failure();
+    }
+
     int64_t numElemsPerLoad = mlir::ceil(
         tileHeight * tileWidth * numPackedVals * vBlocks, (int)threadsPerWarp);
     unsigned numValuesPerLoad = mlir::ceil((int)numElemsPerLoad, numPackedVals);
@@ -2740,8 +2777,6 @@ struct LoadOpToBlockIOConversion
         }
       } break;
       case DpasEncodingAttr::OpIdx::OperandB: {
-        assert(numPackedVals == 1 &&
-               "invalid number of packed values for DPAS operand B.");
         unsigned elemsPerLanePerDPASInst =
             product<unsigned>(dpasLayout.getDPASInstShapeB()) / threadsPerWarp;
         // Block 2D contain at least one DotOp B.
@@ -2751,6 +2786,9 @@ struct LoadOpToBlockIOConversion
           if (tileHeight >= (opsPerChannel * sysDepth) &&
               ((opsPerChannel == 4 && elemSizeInBits == 8) ||
                (opsPerChannel == 2 && elemSizeInBits == 16))) {
+            assert(!isTransposeRequired ||
+                   opsPerChannel == numPackedVals &&
+                       "invalid opsPerChannel for transposed DotOp B");
             // Use the VNNI packing format for DotOp B layout.
             numValuesPerLoad = numElemsPerLoad / opsPerChannel;
             packedType = i32_ty;
@@ -2814,8 +2852,8 @@ struct LoadOpToBlockIOConversion
           /*tile_width*/ tileWidth,
           /*tile_height*/ tileHeight,
           /*v_blocks*/ vBlocks,
-          /*transpose*/ false,
-          /*vnni_transform*/ useVNNIFormat);
+          /*transpose*/ isTransposeRequired,
+          /*vnni_transform*/ !isTransposeRequired && useVNNIFormat);
 
       // When strides[0] is 0, we only want to load the first row, so we
       // set the base height to be 1. If tile height is bigger than 1,