
Commit 9a9672a
Fix CI failure
1 parent 2651270

File tree: 2 files changed, +56 -31 lines

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp

Lines changed: 33 additions & 30 deletions
@@ -431,6 +431,15 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline(
   SmallVector<SmallVector<int64_t>> outerPerm;
   SmallVector<PackingConfigPackingLevelAttr> packingConfigLevelsVal;
 
+  int64_t m0Tile = packPeelTiling.M0;
+  int64_t n0Tile = packPeelTiling.N0;
+  // For 4D matmul-like ops, only tile the outer dims.
+  // outer_tile_size = total_tile_size / inner_dim_size
+  if (is4DMatmulLikeOp(linalgOp)) {
+    m0Tile /= maybeInputDimsAndSizes.value().mSizes.back();
+    n0Tile /= maybeInputDimsAndSizes.value().nSizes.back();
+  }
+
   // Pack level => 1.
   // For 2D matmul-like ops, the first level is to pack operands from 2D to 4D.
   // If the input is a 4D matmul-like op, this level of packing is not needed.
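To make the outer_tile_size formula above concrete with made-up numbers: for a hypothetical 4D matmul-like op with mSizes = [16, 32] and a total M tile size of 64, only the outer m dimension is tiled, by 64 / 32 = 2, while the inner dimension of size 32 is left to the packing levels. These sizes are illustrative and not taken from this commit.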
@@ -441,6 +450,30 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline(
   packedSizesL0[nDims.back()] = packPeelTiling.n0Pack;
   packedSizesL0[kDims.back()] = packPeelTiling.k0Pack;
 
+  // TODO(avarma): This is currently a workaround for a 1x1 AIE array, to make
+  // those 2D matmul shapes work for which all of the operands get pulled into
+  // the L2 buffer. Once reprogramming of DMA ops is supported, we can get rid
+  // of this workaround. We need this only for pack-peel-4-level-tiling,
+  // NOT pack-peel. The workaround just ensures that the first-level tile size
+  // is NOT equal to (M, N), by halving n0Tile.
+  if (numRows == 1 && numCols == 1) {
+    auto getTotalSize = [](ArrayRef<int64_t> sizes) {
+      return std::accumulate(sizes.begin(), sizes.end(), 1,
+                             std::multiplies<int64_t>());
+    };
+
+    // Get the shape (M, N) of the full matmul operation.
+    auto maybeInputDimsAndSizes = getInputDimsAndSizes(linalgOp);
+    int64_t M = getTotalSize(maybeInputDimsAndSizes.value().mSizes);
+    int64_t N = getTotalSize(maybeInputDimsAndSizes.value().nSizes);
+    // If the generated tile size is exactly the operand size, halve n0Tile;
+    // if that makes it smaller than the packing size, halve that as well.
+    if (m0Tile == M && n0Tile == N) {
+      n0Tile /= 2;
+      if (n0Tile < packedSizesL0[nDims.back()])
+        packedSizesL0[nDims.back()] /= 2;
+    }
+  }
   transposePackIndices = {0, 1, 2};
   // There is no corresponding unpack for the specified pack operation
   // 0 is used when unpack is empty
@@ -507,36 +540,6 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline(
     assert(!batchDims.empty() && "expected batch dims not empty");
     tileSizeLevel0[batchDims[0]] = 1;
   }
-  int64_t m0Tile = packPeelTiling.M0;
-  int64_t n0Tile = packPeelTiling.N0;
-  // For 4D matmul-like ops, only tile the outer dims.
-  // outer_tile_size = total_tile_size / inner_dim_size
-  if (is4DMatmulLikeOp(linalgOp)) {
-    m0Tile /= maybeInputDimsAndSizes.value().mSizes.back();
-    n0Tile /= maybeInputDimsAndSizes.value().nSizes.back();
-  }
-  // TODO(avarma): This is currently a workaround for 1x1 AIE array to make
-  // those 2D matmul shapes work for which all of the operands get pulled in to
-  // L2 buffer. Once reprogramming of DMA ops is supported, we can get rid of
-  // this workaround. We need to add this only for pack-peel-4-level-tiling NOT
-  // pack-peel. The workaround just ensures that the tile size of first level is
-  // NOT equal to M,N by halving the n0Tile.
-  if (is2DMatmulLike && numRows == 1 && numCols == 1) {
-    auto getTotalSize = [](ArrayRef<int64_t> sizes) {
-      return std::accumulate(sizes.begin(), sizes.end(), 1,
-                             std::multiplies<int64_t>());
-    };
-
-    // Get the shape (M, N) of the full Matmul operation.
-    auto maybeInputDimsAndSizes = getInputDimsAndSizes(linalgOp);
-    int64_t M = getTotalSize(maybeInputDimsAndSizes.value().mSizes);
-    int64_t N = getTotalSize(maybeInputDimsAndSizes.value().nSizes);
-    // Check if the tile size generated is exactly same as operand size. If yes,
-    // halve n0Tile.
-    if (m0Tile == M && n0Tile == N) {
-      n0Tile /= 2;
-    }
-  }
   tileSizeLevel0[mDims[0]] = m0Tile;
   tileSizeLevel0[nDims[0]] = n0Tile;
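The net effect of the relocated workaround is easiest to see with concrete numbers. Below is a minimal standalone C++ sketch of the halving logic, using starting tile and pack values assumed from the two tests in this commit; packPeelTiling.M0/N0 and n0Pack do not appear in this diff, so those starting values are inferred from the expected test output, not quoted from the repository.

#include <cstdint>
#include <iostream>

int main() {
  // Assumed starting values per test case, inferred from expected output:
  // {M, N, m0Tile, n0Tile, n0Pack}.
  struct Case { int64_t M, N, m0Tile, n0Tile, n0Pack; };
  Case cases[] = {{32, 512, 32, 512, 32},  // matmul_32x512x64_i32
                  {32, 32, 32, 32, 32}};   // matmul_..._32x32x128_i32
  for (Case &c : cases) {
    // On a 1x1 AIE array, a level-0 tile covering the full (M, N) operand
    // is halved in N so the first tiling level never equals the whole shape.
    if (c.m0Tile == c.M && c.n0Tile == c.N) {
      c.n0Tile /= 2;
      // New in this commit: keep the packing size no larger than the tile.
      if (c.n0Tile < c.n0Pack) c.n0Pack /= 2;
    }
    std::cout << "n0Tile = " << c.n0Tile << ", n0Pack = " << c.n0Pack << "\n";
    // Prints: n0Tile = 256, n0Pack = 32
    //         n0Tile = 16, n0Pack = 16
  }
}

These two outcomes match the expected tile_sizes and packedSizes in the test file below.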

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir

Lines changed: 23 additions & 1 deletion
@@ -55,7 +55,7 @@ module {
 
 // Pack-peel-4-level tiling on 1x1 core : the tile size for N gets halved in this case.
 // PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #config = #iree_codegen.lowering_config<tile_sizes = [[32, 256, 0], [1, 1, 0], [0, 0, 1], [1, 1, 0]]>
-// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
+// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
 func.func @matmul_32x512x64_i32() {
   %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
@@ -70,3 +70,25 @@ func.func @matmul_32x512x64_i32() {
   iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 512], strides = [1, 1] : tensor<32x512xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x512xi32>>
   return
 }
+
+// -----
+
+// Based on the above workaround, this test shows the packing size of N also being halved
+// when the tile size for the N dimension becomes smaller than the corresponding packing size.
+
+// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #config = #iree_codegen.lowering_config<tile_sizes = [[32, 16, 0], [1, 1, 0], [0, 0, 1], [1, 1, 0]]>
+// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 16, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
+func.func @matmul_dispatch_0_matmul_32x32x128_i32() {
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x128xi32>>
+  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x32xi32>>
+  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x32xi32>>
+  %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x128xi32>> -> tensor<32x128xi32>
+  %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x32xi32>> -> tensor<128x32xi32>
+  %5 = tensor.empty() : tensor<32x32xi32>
+  %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<32x32xi32>) -> tensor<32x32xi32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<32x128xi32>, tensor<128x32xi32>) outs(%6 : tensor<32x32xi32>) -> tensor<32x32xi32>
+  iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x32xi32>>
+  return
+}
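Working through the new test with the relocated logic: the matmul is 32x32x128, so M = N = 32. Assuming the level-0 tile starts out covering the whole output (inferred from the workaround firing, since the starting tile sizes are not shown in this diff), n0Tile is halved to 16; because 16 is now smaller than the n0 packing size of 32, the packing size is halved to 16 as well. That is exactly what the expected tile_sizes = [[32, 16, 0], ...] and packedSizes = [32, 16, 64] check lines encode.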
