15 changes: 15 additions & 0 deletions build_tools/ci/cpu_comparison/matmul_test_config.py
@@ -86,6 +86,21 @@
"--iree-amdaie-num-cols=1",
],
},
# TODO: Remove after testing all 1x1 shapes.
{
"M": 32,
"N": 512,
"K": 64,
"input_type": "i32",
"acc_type": "i32",
"tile_pipeline": "pack-peel-4-level-tiling",
"name_suffix": "OneCore_npu4",
"additional_labels": ["OneCore"],
"aie_compilation_flags": [
"--iree-amdaie-num-rows=1",
"--iree-amdaie-num-cols=1",
],
},
{
"M": 64,
"N": 128,
@@ -303,6 +303,21 @@ FailureOr<ParameterSetting> ParameterSetting::create(
TileSize maxL0Size = selectL2TileSizes(tileParams, m0Pack, n0Pack);
M0 = maxL0Size.M;
N0 = maxL0Size.N;

    // TODO(avarma): This is a workaround for the 1x1 AIE array, to make those
    // 2D matmul shapes work for which all of the operands get pulled into the
    // L2 buffer. Once reprogramming of DMA ops is supported, this workaround
    // can be removed. It is needed only for pack-peel-4-level-tiling, NOT for
    // pack-peel. The workaround ensures that the first-level tile sizes are
    // not equal to M, N by halving the N0 tile.
if (numRows == 1 && numCols == 1) {
Review comment (Collaborator): Do you only see this issue for 1x1? I would expect it for any number of cores?
      // Check if the generated tile size is exactly the same as the operand
      // size. If yes, halve the N0 tile.
if (M0 == M && N0 == N) {
N0 /= 2;
if (N0 < n0Pack) n0Pack /= 2;
}
}
}

// Currently there is only one level of tiling for K dimension, and the packed
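For readers skimming the diff, here is a minimal, standalone Python sketch of the halving rule added in the hunk above. Only M, N, M0, N0, n0Pack, numRows, numCols and the halving logic come from the diff; the function name and the driver call at the bottom are illustrative and not part of the pass.

def apply_one_core_workaround(M, N, M0, N0, n0Pack, numRows, numCols):
    # Only the 1x1 AIE array configuration is adjusted; other array shapes
    # keep the tile sizes chosen by selectL2TileSizes.
    if numRows == 1 and numCols == 1:
        # If the first-level tile covers the whole operand, halve the N tile
        # so the first tiling level is strictly smaller than the matmul shape.
        if M0 == M and N0 == N:
            N0 //= 2
            # If the halved tile is now smaller than the packing size,
            # halve the packing size as well.
            if N0 < n0Pack:
                n0Pack //= 2
    return N0, n0Pack

# The 32x512x64 test added below: N0 drops from 512 to 256, while n0Pack (32)
# is already smaller than the halved tile and stays unchanged.
print(apply_one_core_workaround(32, 512, 32, 512, 32, 1, 1))  # -> (256, 32)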
@@ -1,5 +1,6 @@
// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu4})' %s | FileCheck %s
// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu4 use-tile-pipeline=pack-peel-4-level-tiling})' %s | FileCheck %s --check-prefix=PACK-PEEL-4-LEVEL
// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu4 use-tile-pipeline=pack-peel-4-level-tiling num-rows=1 num-cols=1})' %s | FileCheck %s --check-prefix=PACK-PEEL-4-LEVEL-1-CORE

// CHECK: #config = #iree_codegen.lowering_config<tile_sizes = [
// CHECK-SAME: [128, 128, 0], [0, 0, 1], [1, 1, 0]
@@ -40,3 +41,54 @@ module {
return
}
}

// -----

// Tests a matmul shape for pack-peel-4-level-tiling in which the generated tile sizes
// would ideally equal the M, N, K dimensions of the matmul - but that does not work
// until support for reconfiguring DMA ops is added. The workaround therefore halves
// the tile size for N.

// Pack-peel-4-level tiling on a 4x4 core array: the tile sizes remain at their maximum in this case.
// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config<tile_sizes = [[32, 512, 0], [4, 4, 0], [0, 0, 1], [1, 1, 0]]>
// PACK-PEEL-4-LEVEL{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [8, 32, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>

// Pack-peel-4-level tiling on a 1x1 core array: the tile size for N is halved in this case.
// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #config = #iree_codegen.lowering_config<tile_sizes = [[32, 256, 0], [1, 1, 0], [0, 0, 1], [1, 1, 0]]>
// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
func.func @matmul_32x512x64_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x64xi32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x512xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x512xi32>>
%3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 64], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x64xi32>> -> tensor<32x64xi32>
%4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x512xi32>> -> tensor<64x512xi32>
%5 = tensor.empty() : tensor<32x512xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<32x512xi32>) -> tensor<32x512xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<32x64xi32>, tensor<64x512xi32>) outs(%6 : tensor<32x512xi32>) -> tensor<32x512xi32>
iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 512], strides = [1, 1] : tensor<32x512xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x512xi32>>
return
}

// -----

// Based on the above workaround, this test shows the packing size for N also being halved
// when the halved tile size for the N dimension becomes smaller than the corresponding packing size.

// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #config = #iree_codegen.lowering_config<tile_sizes = [[32, 16, 0], [1, 1, 0], [0, 0, 1], [1, 1, 0]]>
// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 16, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
func.func @matmul_dispatch_0_matmul_32x32x128_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x128xi32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x32xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x32xi32>>
%3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x128xi32>> -> tensor<32x128xi32>
%4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x32xi32>> -> tensor<128x32xi32>
%5 = tensor.empty() : tensor<32x32xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<32x32xi32>) -> tensor<32x32xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<32x128xi32>, tensor<128x32xi32>) outs(%6 : tensor<32x32xi32>) -> tensor<32x32xi32>
iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x32xi32>>
return
}
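As a quick sanity check of the PACK-PEEL-4-LEVEL-1-CORE expectations in this file, a small self-contained Python snippet applying the same halving rule to the 32x32x128 test; the starting values M0 = N0 = 32 are an assumption based on the test comments (the pre-workaround tile sizes equal the full matmul shape), and the variable names are illustrative.

# 32x32x128 case on a 1x1 core array: both first-level tiles initially
# equal the matmul shape, so the workaround kicks in.
M, N = 32, 32
M0, N0, n0Pack = 32, 32, 32   # assumed pre-workaround values
if M0 == M and N0 == N:
    N0 //= 2                  # 32 -> 16, matching tile_sizes [[32, 16, 0], ...]
    if N0 < n0Pack:
        n0Pack //= 2          # 32 -> 16, matching packedSizes [32, 16, 64]
print(N0, n0Pack)             # 16 16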