Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ void GPUApplyTilingLevelPass::runOnOperation() {
if (tilingLevel != IREE::GPU::TilingLevel::Reduction &&
tilingLevel != IREE::GPU::TilingLevel::Thread &&
tilingLevel != IREE::GPU::TilingLevel::Subgroup &&
tilingLevel != IREE::GPU::TilingLevel::PartialReduction) {
tilingLevel != IREE::GPU::TilingLevel::PartialReduction &&
tilingLevel != IREE::GPU::TilingLevel::Serial) {
Comment on lines 65 to +69
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use `llvm::is_contained` here instead of the chain of `!=` comparisons against each supported tiling level.

funcOp.emitError() << "unsupported tiling level: "
<< IREE::GPU::stringifyEnum(tilingLevel) << "\n";
return signalPassFailure();
Expand Down
2 changes: 2 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ def GPUApplyTilingLevelPass :
"Tile and fuse all annotated ops to serial loops"),
clEnumValN(IREE::GPU::TilingLevel::PartialReduction, "partial_reduction",
"Tile and fuse all annotated ops to partial reduction loops"),
clEnumValN(IREE::GPU::TilingLevel::Serial, "serial",
"Tile and fuse all annotated ops to serial loops"),
clEnumValN(IREE::GPU::TilingLevel::Thread, "thread",
"Tile and fuse all annotated ops to threads"),
clEnumValN(IREE::GPU::TilingLevel::Subgroup, "subgroup",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=subgroup}, canonicalize, cse))" %s | FileCheck %s --check-prefix=SUBGROUP
// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=partial_reduction}, canonicalize, cse))" %s | FileCheck %s --check-prefix=PARTRED
// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{normalize-loops}, canonicalize, cse))" %s | FileCheck %s --check-prefix=NORM-REDUCTION
// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=serial}, canonicalize, cse))" %s | FileCheck %s --check-prefix=SERIAL

#config = #iree_gpu.lowering_config<{thread = [2, 16], subgroup = [2, 16]}>
#map = affine_map<(d0, d1) -> (d0, d1)>
Expand Down Expand Up @@ -654,3 +655,27 @@ func.func @no_swap_collapse_shape_with_extract_slice(%arg0: tensor<288x3x3x32xf3
// NORM-REDUCTION: tensor.extract_slice {{.*}} tensor<2592x32xf32> to tensor<2592x?xf32>
// NORM-REDUCTION-NOT: tensor.collapse_shape
// NORM-REDUCTION: linalg.copy

// -----

// Input for the `tiling-level=serial` RUN line: an element-wise f32 add over
// 4x256 tensors whose lowering config carries only a `serial` entry.
// NOTE(review): a tile size of 0 presumably leaves that dimension untiled; the
// expected output loops only over 0..256 in steps of 16. Confirm against the
// pass implementation.
#config = #iree_gpu.lowering_config<{serial = [0, 16]}>
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
func.func @serial_tiling(%3: tensor<4x256xf32>, %4: tensor<4x256xf32>, %5: tensor<4x256xf32>) -> tensor<4x256xf32> {
// Both iterators are parallel and all operands use the identity map.
%6 = linalg.generic {
indexing_maps = [#map, #map, #map],
iterator_types = ["parallel", "parallel"]
} ins(%3, %4 : tensor<4x256xf32>, tensor<4x256xf32>) outs(%5 : tensor<4x256xf32>) attrs = {lowering_config = #config} {
^bb0(%in: f32, %in_0: f32, %out: f32):
// The init value %out is ignored; the result is the sum of the two inputs.
%7 = arith.addf %in, %in_0 : f32
linalg.yield %7 : f32
} -> tensor<4x256xf32>
return %6 : tensor<4x256xf32>
}
}

// SERIAL-LABEL: func.func @serial_tiling
// SERIAL: scf.forall ({{.*}}) = (0) to (256) step (16)
// SERIAL: linalg.generic
// SERIAL: scf.forall.in_parallel
// SERIAL-NOT: mapping
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,10 @@ LogicalResult applyTileAndFuseToEachRoot(
tilingOptions.setMapping(llvm::to_vector(llvm::reverse(mapping)));
}

if (tilingLevel == IREE::GPU::TilingLevel::Serial) {
tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
}

if (tilingLevel == IREE::GPU::TilingLevel::PartialReduction) {
tilingOptions.setReductionTilingStrategy(
ReductionTilingStrategy::PartialReductionOuterReduction);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1791,6 +1791,7 @@ std::array<int64_t, 3> TargetAttr::getMaximumWorkgroupCount() const {
constexpr StringLiteral kWorkgroupLevelName = "workgroup";
constexpr StringLiteral kPartialReductionLevelName = "partial_reduction";
constexpr StringLiteral kReductionLevelName = "reduction";
constexpr StringLiteral kSerialLevelName = "serial";
constexpr StringLiteral kThreadLevelName = "thread";
constexpr StringLiteral kSubgroupLevelName = "subgroup";
constexpr StringLiteral kLaneLevelName = "lane";
Expand All @@ -1803,6 +1804,8 @@ StringRef getTilingLevelName(GPU::TilingLevel level) {
return kPartialReductionLevelName;
case GPU::TilingLevel::Reduction:
return kReductionLevelName;
case GPU::TilingLevel::Serial:
return kSerialLevelName;
case GPU::TilingLevel::Thread:
return kThreadLevelName;
case GPU::TilingLevel::Subgroup:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -367,9 +367,10 @@ def IREEGPU_ScaledMMAIntrinsic : IREEGPU_I32Enum<"ScaledMMAIntrinsic",
def Workgroup : I32EnumAttrCase<"Workgroup", 0>;
def Reduction : I32EnumAttrCase<"Reduction", 1>;
def PartialReduction : I32EnumAttrCase<"PartialReduction", 2>;
def Thread : I32EnumAttrCase<"Thread", 3>;
def Subgroup : I32EnumAttrCase<"Subgroup", 4>;
def Lane : I32EnumAttrCase<"Lane", 5>;
def Serial : I32EnumAttrCase<"Serial", 3>;
def Thread : I32EnumAttrCase<"Thread", 4>;
def Subgroup : I32EnumAttrCase<"Subgroup", 5>;
def Lane : I32EnumAttrCase<"Lane", 6>;

/// Enum descriptor for the set of tiling levels for GPU pass pipelines.
/// Note that `Thread` tiling is mutually exclusive with `Subgroup` and
Expand All @@ -380,6 +381,7 @@ def IREEGPU_TilingLevel : IREEGPU_I32EnumAttr<"TilingLevel",
Workgroup,
Reduction,
PartialReduction,
Serial,
Thread,
Subgroup,
Lane
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,12 @@ getVectorDistributeReductionConfig(
// Set the configuration for the operation with no reduction dims.
// The workgroup tile sizes are set by the reduction operation.
if (reductionDims.empty()) {
SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 1);
SmallVector<int64_t> serialTileSizes(op.getNumLoops(), 1);

// For the shared wgp dimension, set the reduction tile sizes to be zero.
// For the shared wgp dimension, set the serial tile sizes to be zero.
// Copy the workgroup tile sizes from the sharedWgpDims.
for (const auto &[dim, tile_size] : sharedWgpTiles) {
reductionTileSizes[dim] = 0;
serialTileSizes[dim] = 0;
workgroupTileSizes[dim] = tile_size;
}

Expand Down Expand Up @@ -214,7 +214,7 @@ getVectorDistributeReductionConfig(
subgroupBasis = 1;
}

reductionTileSizes[parallelDims.back()] = lastDimReductionTileSize;
serialTileSizes[parallelDims.back()] = lastDimReductionTileSize;
threadTileSizes[parallelDims.back()] = threadLoads;
threadCounts[parallelDims.back()] = threadBasis;
subGroupCounts[parallelDims.back()] = subgroupBasis;
Expand All @@ -227,7 +227,7 @@ getVectorDistributeReductionConfig(

NamedAttribute configAttrs[] = {
NamedAttribute("workgroup", b.getI64ArrayAttr(workgroupTileSizes)),
NamedAttribute("reduction", b.getI64ArrayAttr(reductionTileSizes)),
NamedAttribute("serial", b.getI64ArrayAttr(serialTileSizes)),
NamedAttribute("thread", b.getI64ArrayAttr(threadTileSizes)),
NamedAttribute("lane_basis", laneBasisAttr),
NamedAttribute("subgroup_basis", subgroupBasisAttr)};
Expand Down
11 changes: 11 additions & 0 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -804,6 +804,17 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
funcPassManager.addPass(createCSEPass());
}

// Tile to serial loops.
{
GPUApplyTilingLevelPassOptions options;
options.tilingLevel = IREE::GPU::TilingLevel::Serial;
options.allowZeroSlices = true;
funcPassManager.addPass(createGPUApplyTilingLevelPass(options));
funcPassManager.addPass(affine::createLoopCoalescingPass());
funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
}

funcPassManager.addPass(IREE::LinalgExt::createDecomposeAttentionPass());
funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ func.func @test_multiple_reduction() {
// CHECK-SAME: outs(%{{.*}} : tensor<2x32x10x16384xf32>)
// CHECK-SAME: attrs = {lowering_config = #iree_gpu.lowering_config<{
// CHECK-SAME: lane_basis = {{\[}}[1, 1, 1, 64], [0, 1, 2, 3]]
// CHECK-SAME: reduction = [0, 0, 1, 8192],
// CHECK-SAME: serial = [0, 0, 1, 8192],
// CHECK-SAME: subgroup_basis = {{\[}}[1, 1, 1, 16], [0, 1, 2, 3]],
// CHECK-SAME: thread = [0, 0, 0, 8],

Expand Down Expand Up @@ -248,7 +248,7 @@ func.func @test_multiple_stores(%arg0: !iree_tensor_ext.dispatch.tensor<readonly
// CHECK: linalg.generic
// CHECK-SAME: attrs = {lowering_config = #iree_gpu.lowering_config<{
// CHECK-SAME: lane_basis = {{\[}}[1, 64], [0, 1]],
// CHECK-SAME: reduction = [0, 4096],
// CHECK-SAME: serial = [0, 4096],
// CHECK-SAME: subgroup_basis = {{\[}}[1, 16], [0, 1]],
// CHECK-SAME: thread = [0, 4],
// CHECK-SAME: workgroup = [1, 0]
Expand Down
Loading