
Commit 9a9672a
Fix CI failure
1 parent 2651270

File tree: 2 files changed, +56 -31 lines

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp

Lines changed: 33 additions & 30 deletions
@@ -431,6 +431,15 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline(
   SmallVector<SmallVector<int64_t>> outerPerm;
   SmallVector<PackingConfigPackingLevelAttr> packingConfigLevelsVal;
 
+  int64_t m0Tile = packPeelTiling.M0;
+  int64_t n0Tile = packPeelTiling.N0;
+  // For 4D matmul-like ops, only tile the outer dims.
+  // outer_tile_size = total_tile_size / inner_dim_size
+  if (is4DMatmulLikeOp(linalgOp)) {
+    m0Tile /= maybeInputDimsAndSizes.value().mSizes.back();
+    n0Tile /= maybeInputDimsAndSizes.value().nSizes.back();
+  }
+
   // Pack level => 1.
   // For 2D matmul-like ops, the first level is to pack operands from 2D to 4D.
   // If the input is a 4D matmul-like op, this level of packing is not needed.
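To make the outer_tile_size formula above concrete with made-up numbers: for a hypothetical 4D matmul-like op with mSizes = [16, 32] and a total M tile size of 64, only the outer m dimension is tiled, by 64 / 32 = 2, while the inner dimension of size 32 is left to the packing levels. These sizes are illustrative and not taken from this commit.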
@@ -441,6 +450,30 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline(
   packedSizesL0[nDims.back()] = packPeelTiling.n0Pack;
   packedSizesL0[kDims.back()] = packPeelTiling.k0Pack;
 
+  // TODO(avarma): This is currently a workaround for a 1x1 AIE array, to make
+  // those 2D matmul shapes work for which all of the operands get pulled into
+  // the L2 buffer. Once reprogramming of DMA ops is supported, we can get rid
+  // of this workaround. We need this only for pack-peel-4-level-tiling,
+  // NOT pack-peel. The workaround just ensures that the first-level tile size
+  // is NOT equal to (M, N), by halving n0Tile.
+  if (numRows == 1 && numCols == 1) {
+    auto getTotalSize = [](ArrayRef<int64_t> sizes) {
+      return std::accumulate(sizes.begin(), sizes.end(), 1,
+                             std::multiplies<int64_t>());
+    };
+
+    // Get the shape (M, N) of the full matmul operation.
+    auto maybeInputDimsAndSizes = getInputDimsAndSizes(linalgOp);
+    int64_t M = getTotalSize(maybeInputDimsAndSizes.value().mSizes);
+    int64_t N = getTotalSize(maybeInputDimsAndSizes.value().nSizes);
+    // If the generated tile size is exactly the operand size, halve n0Tile;
+    // if that makes it smaller than the packing size, halve that as well.
+    if (m0Tile == M && n0Tile == N) {
+      n0Tile /= 2;
+      if (n0Tile < packedSizesL0[nDims.back()])
+        packedSizesL0[nDims.back()] /= 2;
+    }
+  }
   transposePackIndices = {0, 1, 2};
   // There is no corresponding unpack for the specified pack operation
   // 0 is used when unpack is empty
@@ -507,36 +540,6 @@ static LogicalResult setRootConfigForPackPeel4LevelTilingPipeline(
     assert(!batchDims.empty() && "expected batch dims not empty");
     tileSizeLevel0[batchDims[0]] = 1;
   }
-  int64_t m0Tile = packPeelTiling.M0;
-  int64_t n0Tile = packPeelTiling.N0;
-  // For 4D matmul-like ops, only tile the outer dims.
-  // outer_tile_size = total_tile_size / inner_dim_size
-  if (is4DMatmulLikeOp(linalgOp)) {
-    m0Tile /= maybeInputDimsAndSizes.value().mSizes.back();
-    n0Tile /= maybeInputDimsAndSizes.value().nSizes.back();
-  }
-  // TODO(avarma): This is currently a workaround for 1x1 AIE array to make
-  // those 2D matmul shapes work for which all of the operands get pulled in to
-  // L2 buffer. Once reprogramming of DMA ops is supported, we can get rid of
-  // this workaround. We need to add this only for pack-peel-4-level-tiling NOT
-  // pack-peel. The workaround just ensures that the tile size of first level is
-  // NOT equal to M,N by halving the n0Tile.
-  if (is2DMatmulLike && numRows == 1 && numCols == 1) {
-    auto getTotalSize = [](ArrayRef<int64_t> sizes) {
-      return std::accumulate(sizes.begin(), sizes.end(), 1,
-                             std::multiplies<int64_t>());
-    };
-
-    // Get the shape (M, N) of the full Matmul operation.
-    auto maybeInputDimsAndSizes = getInputDimsAndSizes(linalgOp);
-    int64_t M = getTotalSize(maybeInputDimsAndSizes.value().mSizes);
-    int64_t N = getTotalSize(maybeInputDimsAndSizes.value().nSizes);
-    // Check if the tile size generated is exactly same as operand size. If yes,
-    // halve n0Tile.
-    if (m0Tile == M && n0Tile == N) {
-      n0Tile /= 2;
-    }
-  }
   tileSizeLevel0[mDims[0]] = m0Tile;
   tileSizeLevel0[nDims[0]] = n0Tile;
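The net effect of the relocated workaround is easiest to see with concrete numbers. Below is a minimal standalone C++ sketch of the halving logic, using starting tile and pack values assumed from the two tests in this commit; packPeelTiling.M0/N0 and n0Pack do not appear in this diff, so those starting values are inferred from the expected test output, not quoted from the repository.

#include <cstdint>
#include <iostream>

int main() {
  // Assumed starting values per test case, inferred from expected output:
  // {M, N, m0Tile, n0Tile, n0Pack}.
  struct Case { int64_t M, N, m0Tile, n0Tile, n0Pack; };
  Case cases[] = {{32, 512, 32, 512, 32},  // matmul_32x512x64_i32
                  {32, 32, 32, 32, 32}};   // matmul_..._32x32x128_i32
  for (Case &c : cases) {
    // On a 1x1 AIE array, a level-0 tile covering the full (M, N) operand
    // is halved in N so the first tiling level never equals the whole shape.
    if (c.m0Tile == c.M && c.n0Tile == c.N) {
      c.n0Tile /= 2;
      // New in this commit: keep the packing size no larger than the tile.
      if (c.n0Tile < c.n0Pack) c.n0Pack /= 2;
    }
    std::cout << "n0Tile = " << c.n0Tile << ", n0Pack = " << c.n0Pack << "\n";
    // Prints: n0Tile = 256, n0Pack = 32
    //         n0Tile = 16, n0Pack = 16
  }
}

These two outcomes match the expected tile_sizes and packedSizes in the test file below.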

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy_objectfifo_npu4.mlir

Lines changed: 23 additions & 1 deletion
@@ -55,7 +55,7 @@ module {
 
 // Pack-peel-4-level tiling on 1x1 core : the tile size for N gets halved in this case.
 // PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #config = #iree_codegen.lowering_config<tile_sizes = [[32, 256, 0], [1, 1, 0], [0, 0, 1], [1, 1, 0]]>
-// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
+// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 32, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
 func.func @matmul_32x512x64_i32() {
   %c0_i32 = arith.constant 0 : i32
   %c0 = arith.constant 0 : index
@@ -70,3 +70,25 @@ func.func @matmul_32x512x64_i32() {
   iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 512], strides = [1, 1] : tensor<32x512xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x512xi32>>
   return
 }
+
+// -----
+
+// Based on the above workaround, this test shows the packing size of N also being halved
+// when the tile size for the N dimension becomes smaller than the corresponding packing size.
+
+// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #config = #iree_codegen.lowering_config<tile_sizes = [[32, 16, 0], [1, 1, 0], [0, 0, 1], [1, 1, 0]]>
+// PACK-PEEL-4-LEVEL-1-CORE{LITERAL}: #packingConfig = #amdaie.packing_config<packing_config = [{packedSizes = [32, 16, 64], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1], [1, 0], [1, 0]]}, {packedSizes = [0, 0, 0, 4, 4, 8], transposePackIndices = [0, 1, 2], unpackEmpty = [false, false, true], innerPerm = [[0, 1], [1, 0], [0, 1]], outerPerm = [[0, 1, 3, 2], [0, 1, 3, 2], [0, 1, 3, 2]]}]>
+func.func @matmul_dispatch_0_matmul_32x32x128_i32() {
+  %c0_i32 = arith.constant 0 : i32
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x128xi32>>
+  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x32xi32>>
+  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x32xi32>>
+  %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 128], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<32x128xi32>> -> tensor<32x128xi32>
+  %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [128, 32], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<128x32xi32>> -> tensor<128x32xi32>
+  %5 = tensor.empty() : tensor<32x32xi32>
+  %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<32x32xi32>) -> tensor<32x32xi32>
+  %7 = linalg.matmul ins(%3, %4 : tensor<32x128xi32>, tensor<128x32xi32>) outs(%6 : tensor<32x32xi32>) -> tensor<32x32xi32>
+  iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : tensor<32x32xi32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<32x32xi32>>
+  return
+}
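Working through the new test with the relocated logic: the matmul is 32x32x128, so M = N = 32. Assuming the level-0 tile starts out covering the whole output (inferred from the workaround firing, since the starting tile sizes are not shown in this diff), n0Tile is halved to 16; because 16 is now smaller than the n0 packing size of 32, the packing size is halved to 16 as well. That is exactly what the expected tile_sizes = [[32, 16, 0], ...] and packedSizes = [32, 16, 64] check lines encode.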
