Skip to content

Commit 6417b4a

Browse files
authored
[FoldLinearDims] Support folding with static non-zero offsets (nod-ai#1073)
1 parent 2b89dc2 commit 6417b4a

File tree

3 files changed

+74
-21
lines changed

3 files changed

+74
-21
lines changed

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEDmaUtils.cpp

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -332,30 +332,52 @@ LogicalResult foldLinearDims(
332332
newStrides.push_back(strides[strides.size() - 1]);
333333
newSizes.push_back(sizes[sizes.size() - 1]);
334334

335-
for (int i = offsets.size() - 2; i >= 0; i--) {
335+
for (int i = static_cast<int>(offsets.size()) - 2; i >= 0; i--) {
336336
// Conditions for folding a dim.
337-
// 1. Offsets[i] == 0.This is required because we are dropping the offset
338-
// of the i dimension and keep newOffets[-1]
337+
// 1. Either, offsets[i] == 0 and then we can fold with any `newOffsets[-1]`
338+
// (even dynamic ones), OR offsets[i] multiplied by the respective stride,
339+
// is a multiple of the previous stride.
339340
// 2. newSizes[-1] x newStrides[-1] == strides[i]. With this we can have
340341
// newSizes[-1] = sizes[i] * newSizes[-1] , and then fold away the i
341342
// dimension
342343
// 3. checkValidSize(sizes[i] * newSizes[-1]). This allows hardware
343344
// constraints to be checked.
344345
size_t vecSize = newOffsets.size();
346+
std::optional<int64_t> maybeNewOffset = getConstantIntValue(offsets[i]);
345347
int64_t newStride = staticStrideVals[i];
346348
int64_t newSize = staticSizeVals[i];
349+
std::optional<int64_t> maybePrevOffset =
350+
getConstantIntValue(newOffsets[vecSize - 1]);
347351
int64_t prevStride = getConstantIndexOrAssert(newStrides[vecSize - 1]);
348352
int64_t prevSize = getConstantIndexOrAssert(newSizes[vecSize - 1]);
349353
int64_t dimExtent = prevStride * prevSize;
350354
// Fail if max constraints are provided, but the newly created
351355
// offsets/sizes/strides start exceeding the number of provided max
352356
// constraints as this will result in undefined behaviour.
353357
bool fitsMaxConstraint = checkValidSize(vecSize - 1, newSize * prevSize);
354-
if (fitsMaxConstraint && isConstantIntValue(offsets[i], 0) &&
355-
dimExtent == newStride) {
356-
foldableLinearDimsFound = true;
357-
newSizes[vecSize - 1] = getAsIndexOpFoldResult(ctx, newSize * prevSize);
358-
continue;
358+
if (fitsMaxConstraint && dimExtent == newStride) {
359+
// There are currently two cases supported for folding a dimension:
360+
// 1. If the offset is 0, we can fold the dimension, no matter what the
361+
// value of `newOffsets[-1]` is (it can be dynamic).
362+
// 2. If the offset, multiplied by the respective stride, is a multiple of
363+
// the previous stride, we can fold the dimension if we update the new
364+
// offset as well. However, in this case we need to add to the new offset, and
365+
// this is currently only supported for constant offsets.
366+
if (isConstantIntValue(offsets[i], 0)) {
367+
foldableLinearDimsFound = true;
368+
newSizes[vecSize - 1] = getAsIndexOpFoldResult(ctx, newSize * prevSize);
369+
continue;
370+
} else if (maybeNewOffset.has_value() && maybePrevOffset.has_value()) {
371+
// NOTE: It's guaranteed that
372+
// `(maybeNewOffset.value() * newStride) % prevStride == 0`
373+
// as `newStride == prevStride * prevSize`
374+
foldableLinearDimsFound = true;
375+
newSizes[vecSize - 1] = getAsIndexOpFoldResult(ctx, newSize * prevSize);
376+
int64_t newPrevOffset = maybePrevOffset.value() +
377+
maybeNewOffset.value() * newStride / prevStride;
378+
newOffsets[vecSize - 1] = getAsIndexOpFoldResult(ctx, newPrevOffset);
379+
continue;
380+
}
359381
}
360382
newOffsets.push_back(offsets[i]);
361383
newStrides.push_back(strides[i]);

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/AMDAIEDmaUtilsTest.cpp

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -535,8 +535,6 @@ TEST_F(FoldTest, NoLinearDimsFold) {
535535
checkFoldLinearDims({0}, {8}, {1}, {}, {0}, {8}, {1}, false);
536536
checkFoldLinearDims({0, 0}, {16, 8}, {16, 1}, {}, {0, 0}, {16, 8}, {16, 1},
537537
false);
538-
checkFoldLinearDims({8, 0}, {16, 8}, {8, 1}, {}, {8, 0}, {16, 8}, {8, 1},
539-
false);
540538
}
541539

542540
TEST_F(FoldTest, FoldLinearDims) {
@@ -546,8 +544,8 @@ TEST_F(FoldTest, FoldLinearDims) {
546544
true);
547545
checkFoldLinearDims({0, 0, 0, 0}, {4, 8, 16, 8}, {1024, 128, 8, 1}, {}, {0},
548546
{4096}, {1}, true);
549-
checkFoldLinearDims({0, 0, 8, 0}, {4, 8, 16, 8}, {1024, 128, 8, 1}, {},
550-
{8, 0}, {512, 8}, {8, 1}, true);
547+
checkFoldLinearDims({5, 3, 8, 1}, {4, 8, 16, 8}, {1024, 128, 8, 1}, {},
548+
{5569}, {4096}, {1}, true);
551549
}
552550

553551
TEST_F(FoldTest, FoldLinearDimsWithMax) {
@@ -561,9 +559,9 @@ TEST_F(FoldTest, FoldLinearDimsWithMax) {
561559
checkFoldLinearDims({0, 0, 0, 0}, {4, 8, 16, 8}, {1024, 128, 8, 1},
562560
{1024, 1024, 1024, 1024}, {0, 0}, {4, 1024}, {1024, 1},
563561
true);
564-
checkFoldLinearDims({0, 0, 8, 0}, {4, 8, 16, 8}, {1024, 128, 8, 1},
565-
{511, 511, 511, 511}, {0, 8, 0}, {4, 128, 8},
566-
{1024, 8, 1}, true);
562+
checkFoldLinearDims({4, 0, 8, 0}, {4, 8, 16, 8}, {1024, 128, 8, 1},
563+
{511, 511, 511, 511}, {32, 64}, {32, 128}, {128, 1},
564+
true);
567565
}
568566

569567
TEST_F(FoldTest, NoUnitDimsFold) {

compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/canonicalize_doubly_strided_op.mlir

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,8 @@ func.func @circular_dma_cpy_nd_unit_between_linear(%arg0: !amdaie.logicalobjectf
7777
// -----
7878

7979
// CHECK-LABEL: func.func @circular_dma_cpy_nd_non_zero_offset
80-
// CHECK: amdaie.circular_dma_cpy_nd(%{{.+}}[25, 1] [8, 16] [16, 1], %{{.+}}[5, 1, 1] [4, 2, 8] [16, 8, 1])
81-
// FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%{{.+}}[25, 1] [8, 16] [16, 1], %{{.+}}[5, 1, 1] [4, 2, 8] [16, 8, 1])
80+
// CHECK: amdaie.circular_dma_cpy_nd(%{{.+}}[401] [128] [1], %{{.+}}[89] [64] [1])
81+
// FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%{{.+}}[401] [128] [1], %{{.+}}[89] [64] [1])
8282
func.func @circular_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
8383
%0 = amdaie.circular_dma_cpy_nd(%arg0[2, 1, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[1, 1, 1, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
8484
"iree.keep"(%0) : (index) -> ()
@@ -87,6 +87,17 @@ func.func @circular_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<
8787

8888
// -----
8989

90+
// CHECK-LABEL: func.func @circular_dma_cpy_nd_non_zero_dynamic_offset
91+
// CHECK: amdaie.circular_dma_cpy_nd(%{{.+}}[%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 1, 8, 16] [128, 128, 16, 1], %{{.+}}[%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 4, 2, 8] [64, 16, 8, 1])
92+
// FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%{{.+}}[%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 1, 8, 16] [128, 128, 16, 1], %{{.+}}[%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 4, 2, 8] [64, 16, 8, 1])
93+
func.func @circular_dma_cpy_nd_non_zero_dynamic_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, %arg2: index) {
94+
%0 = amdaie.circular_dma_cpy_nd(%arg0[%arg2, %arg2, %arg2, %arg2] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[%arg2, %arg2, %arg2, %arg2] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
95+
"iree.keep"(%0) : (index) -> ()
96+
return
97+
}
98+
99+
// -----
100+
90101
// CHECK-LABEL: func.func @circular_dma_cpy_nd_partial_non_zero_offset
91102
// CHECK: amdaie.circular_dma_cpy_nd(%{{.+}}[1] [128] [1], %{{.+}}[1] [64] [1])
92103
// FOLD-SINGLE-DIMS: amdaie.circular_dma_cpy_nd(%{{.+}}[1] [128] [1], %{{.+}}[1] [64] [1])
@@ -174,8 +185,8 @@ func.func @dma_cpy_nd_unit_between_linear(%arg0: !amdaie.logicalobjectfifo<memre
174185
// -----
175186

176187
// CHECK-LABEL: func.func @dma_cpy_nd_non_zero_offset
177-
// CHECK: amdaie.dma_cpy_nd(%{{.+}}[25, 1] [8, 16] [16, 1], %{{.+}}[5, 1, 1] [4, 2, 8] [16, 8, 1])
178-
// FOLD-SINGLE-DIMS: amdaie.dma_cpy_nd(%{{.+}}[25, 1] [8, 16] [16, 1], %{{.+}}[5, 1, 1] [4, 2, 8] [16, 8, 1])
188+
// CHECK: amdaie.dma_cpy_nd(%{{.+}}[401] [128] [1], %{{.+}}[89] [64] [1])
189+
// FOLD-SINGLE-DIMS: amdaie.dma_cpy_nd(%{{.+}}[401] [128] [1], %{{.+}}[89] [64] [1])
179190
func.func @dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
180191
%0 = amdaie.dma_cpy_nd(%arg0[1, 2, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[1, 1, 1, 1] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
181192
"iree.keep"(%0) : (index) -> ()
@@ -184,6 +195,17 @@ func.func @dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x
184195

185196
// -----
186197

198+
// CHECK-LABEL: func.func @dma_cpy_nd_non_zero_dynamic_offset
199+
// CHECK: amdaie.dma_cpy_nd(%{{.+}}[%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 1, 8, 16] [128, 128, 16, 1], %{{.+}}[%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 4, 2, 8] [64, 16, 8, 1])
200+
// FOLD-SINGLE-DIMS: amdaie.dma_cpy_nd(%{{.+}}[%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 1, 8, 16] [128, 128, 16, 1], %{{.+}}[%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 4, 2, 8] [64, 16, 8, 1])
201+
func.func @dma_cpy_nd_non_zero_dynamic_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, %arg2: index) {
202+
%0 = amdaie.dma_cpy_nd(%arg0[%arg2, %arg2, %arg2, %arg2] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[%arg2, %arg2, %arg2, %arg2] [1, 4, 2, 8] [64, 16, 8, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
203+
"iree.keep"(%0) : (index) -> ()
204+
return
205+
}
206+
207+
// -----
208+
187209
// CHECK-LABEL: func.func @dma_cpy_nd_partial_non_zero_offset
188210
// CHECK: amdaie.dma_cpy_nd(%{{.+}}[1] [128] [1], %{{.+}}[1] [64] [1])
189211
// FOLD-SINGLE-DIMS: amdaie.dma_cpy_nd(%{{.+}}[1] [128] [1], %{{.+}}[1] [64] [1])
@@ -273,8 +295,8 @@ func.func @npu_dma_cpy_nd_unit_between_linear(%arg0: !amdaie.logicalobjectfifo<m
273295
// -----
274296

275297
// CHECK-LABEL: func.func @npu_dma_cpy_nd_non_zero_offset
276-
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([25, 1] [8, 16] [16, 1], [5, 1, 1] [4, 2, 8] [16, 8, 1])
277-
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([25, 1] [8, 16] [16, 1], [5, 1, 1] [4, 2, 8] [16, 8, 1])
298+
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([401] [128] [1], [89] [64] [1])
299+
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([401] [128] [1], [89] [64] [1])
278300
func.func @npu_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>) {
279301
%0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
280302
amdaie.npu.dma_cpy_nd %0([1, 2, 1, 1] [1, 1, 8, 16] [128, 128, 16, 1], [1, 1, 1, 1] [1, 4, 2, 8] [64, 16, 8, 1])
@@ -283,6 +305,17 @@ func.func @npu_dma_cpy_nd_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memre
283305

284306
// -----
285307

308+
// CHECK-LABEL: func.func @npu_dma_cpy_nd_dynamic_non_zero_offset
309+
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 1, 8, 16] [128, 128, 16, 1], [%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 4, 2, 8] [64, 16, 8, 1])
310+
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 1, 8, 16] [128, 128, 16, 1], [%{{.+}}, %{{.+}}, %{{.+}}, %{{.+}}] [1, 4, 2, 8] [64, 16, 8, 1])
311+
func.func @npu_dma_cpy_nd_dynamic_non_zero_offset(%arg0: !amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, %arg1: !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>, %arg2: index) {
312+
%0 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo<memref<1x1x8x16xi32, 1>>, !amdaie.logicalobjectfifo<memref<8x16xi32, 1>>)
313+
amdaie.npu.dma_cpy_nd %0([%arg2, %arg2, %arg2, %arg2] [1, 1, 8, 16] [128, 128, 16, 1], [%arg2, %arg2, %arg2, %arg2] [1, 4, 2, 8] [64, 16, 8, 1])
314+
return
315+
}
316+
317+
// -----
318+
286319
// CHECK-LABEL: func.func @npu_dma_cpy_nd_partial_non_zero_offset
287320
// CHECK: amdaie.npu.dma_cpy_nd %{{.+}}([1] [128] [1], [1] [64] [1])
288321
// FOLD-SINGLE-DIMS: amdaie.npu.dma_cpy_nd %{{.+}}([1] [128] [1], [1] [64] [1])

0 commit comments

Comments
 (0)