15 changes: 15 additions & 0 deletions build_tools/ci/cpu_comparison/matmul_test_config.py
@@ -86,6 +86,21 @@
"--iree-amdaie-num-cols=1",
],
},
# TODO: Remove after testing all 1x1 shapes.
{
"M": 32,
"N": 512,
"K": 64,
"input_type": "i32",
"acc_type": "i32",
"tile_pipeline": "pack-peel-4-level-tiling",
"name_suffix": "OneCore_npu4",
"additional_labels": ["OneCore"],
"aie_compilation_flags": [
"--iree-amdaie-num-rows=1",
"--iree-amdaie-num-cols=1",
],
},
{
"M": 64,
"N": 128,
@@ -303,6 +303,21 @@ FailureOr<ParameterSetting> ParameterSetting::create(
TileSize maxL0Size = selectL2TileSizes(tileParams, m0Pack, n0Pack);
M0 = maxL0Size.M;
N0 = maxL0Size.N;

    // TODO(avarma): This is a workaround for the 1x1 AIE array, to make those
    // 2D matmul shapes work for which all of the operands get pulled into the
    // L2 buffer. Once reprogramming of DMA ops is supported, this workaround
    // can be removed. It is needed only for pack-peel-4-level-tiling, NOT for
    // pack-peel. The workaround ensures that the first-level tile sizes are
    // not equal to M, N by halving the N0 tile.
if (numRows == 1 && numCols == 1) {
Review comment (Collaborator): Do you only see this issue for 1x1? I would expect it for any number of cores?
      // Check if the generated tile size is exactly the same as the operand
      // size. If yes, halve the N0 tile.
if (M0 == M && N0 == N) {
N0 /= 2;
if (N0 < n0Pack) n0Pack /= 2;
}
}
}

// Currently there is only one level of tiling for K dimension, and the packed
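For readers skimming the diff, here is a minimal, standalone Python sketch of the halving rule added in the hunk above. Only M, N, M0, N0, n0Pack, numRows, numCols and the halving logic come from the diff; the function name and the driver call at the bottom are illustrative and not part of the pass.

def apply_one_core_workaround(M, N, M0, N0, n0Pack, numRows, numCols):
    # Only the 1x1 AIE array configuration is adjusted; other array shapes
    # keep the tile sizes chosen by selectL2TileSizes.
    if numRows == 1 and numCols == 1:
        # If the first-level tile covers the whole operand, halve the N tile
        # so the first tiling level is strictly smaller than the matmul shape.
        if M0 == M and N0 == N:
            N0 //= 2
            # If the halved tile is now smaller than the packing size,
            # halve the packing size as well.
            if N0 < n0Pack:
                n0Pack //= 2
    return N0, n0Pack

# The 32x512x64 test added below: N0 drops from 512 to 256, while n0Pack (32)
# is already smaller than the halved tile and stays unchanged.
print(apply_one_core_workaround(32, 512, 32, 512, 32, 1, 1))  # -> (256, 32)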
@@ -1,5 +1,6 @@
// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu4})' %s | FileCheck %s
// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu4 use-tile-pipeline=pack-peel-4-level-tiling})' %s | FileCheck %s --check-prefix=PACK-PEEL-4-LEVEL
// RUN: iree-opt --split-input-file --pass-pipeline='builtin.module(iree-amdaie-lowering-strategy{target-device=npu4 use-tile-pipeline=pack-peel-4-level-tiling num-rows=1 num-cols=1})' %s | FileCheck %s --check-prefix=PACK-PEEL-4-LEVEL-1-CORE

// CHECK: #config = #iree_codegen.lowering_config<tile_sizes = [
// CHECK-SAME: [128, 128, 0], [0, 0, 1], [1, 1, 0]
@@ -40,3 +41,54 @@ module {
return
}
}

// -----

// Tests a matmul shape for pack-peel-4-level-tiling in which the generated tile sizes
// would ideally equal the M, N, K dimensions of the matmul - but that does not work
// until support for reconfiguring DMA ops is added. The workaround therefore halves
// the tile size for N.

// Pack-peel-4-level tiling on a 4x4 core array: the tile sizes remain at their maximum in this case.
// PACK-PEEL-4-LEVEL{LITERAL}: #config = #iree_codegen.lowering_config<tile_sizes = [[32, 512, 0], [4, 4, 0], [0, 0, 1], [1, 1, 0]]>
// PACK-PEEL-4-LEVEL{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [8, 32, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>

// Pack-peel-4-level tiling on a 1x1 core array: the tile size for N is halved in this case.
// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #config = #iree_codegen.lowering_config<tile_sizes = [[32, 256, 0], [1, 1, 0], [0, 0, 1], [1, 1, 0]]>
// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
func.func @matmul_32x512x64_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x64xi32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x512xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x512xi32>>
%3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 64], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x64xi32>> -> tensor<32x64xi32>
%4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [64, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<64x512xi32>> -> tensor<64x512xi32>
%5 = tensor.empty() : tensor<32x512xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<32x512xi32>) -> tensor<32x512xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<32x64xi32>, tensor<64x512xi32>) outs(%6 : tensor<32x512xi32>) -> tensor<32x512xi32>
iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 512], strides = [1, 1] : tensor<32x512xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x512xi32>>
return
}

// -----

// Based on the above workaround, this test shows the packing size for N also being halved
// when the halved tile size for the N dimension becomes smaller than the corresponding packing size.

// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #config = #iree_codegen.lowering_config<tile_sizes = [[32, 16, 0], [1, 1, 0], [0, 0, 1], [1, 1, 0]]>
// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 16, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
func.func @matmul_dispatch_0_matmul_32x32x128_i32() {
%c0_i32 = arith.constant 0 : i32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x128xi32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x32xi32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x32xi32>>
%3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x128xi32>> -> tensor<32x128xi32>
%4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x32xi32>> -> tensor<128x32xi32>
%5 = tensor.empty() : tensor<32x32xi32>
%6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<32x32xi32>) -> tensor<32x32xi32>
%7 = linalg.matmul ins(%3, %4 : tensor<32x128xi32>, tensor<128x32xi32>) outs(%6 : tensor<32x32xi32>) -> tensor<32x32xi32>
iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x32xi32>>
return
}
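As a quick sanity check of the PACK-PEEL-4-LEVEL-1-CORE expectations in this file, a small self-contained Python snippet applying the same halving rule to the 32x32x128 test; the starting values M0 = N0 = 32 are an assumption based on the test comments (the pre-workaround tile sizes equal the full matmul shape), and the variable names are illustrative.

# 32x32x128 case on a 1x1 core array: both first-level tiles initially
# equal the matmul shape, so the workaround kicks in.
M, N = 32, 32
M0, N0, n0Pack = 32, 32, 32   # assumed pre-workaround values
if M0 == M and N0 == N:
    N0 //= 2                  # 32 -> 16, matching tile_sizes [[32, 16, 0], ...]
    if N0 < n0Pack:
        n0Pack //= 2          # 32 -> 16, matching packedSizes [32, 16, 64]
print(N0, n0Pack)             # 16 16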