iree-org · Groverkss · Oct 29, 2025 · kuhar · Oct 30, 2025
@@ -65,7 +65,8 @@ void GPUApplyTilingLevelPass::runOnOperation() {
   if (tilingLevel != IREE::GPU::TilingLevel::Reduction &&
       tilingLevel != IREE::GPU::TilingLevel::Thread &&
       tilingLevel != IREE::GPU::TilingLevel::Subgroup &&
-      tilingLevel != IREE::GPU::TilingLevel::PartialReduction) {
+      tilingLevel != IREE::GPU::TilingLevel::PartialReduction &&
+      tilingLevel != IREE::GPU::TilingLevel::Serial) {
     funcOp.emitError() << "unsupported tiling level: "
                        << IREE::GPU::stringifyEnum(tilingLevel) << "\n";
     return signalPassFailure();

@@ -345,6 +345,8 @@ def GPUApplyTilingLevelPass :
                          "Tile and fuse all annotated ops to serial loops"),
               clEnumValN(IREE::GPU::TilingLevel::PartialReduction, "partial_reduction",
                          "Tile and fuse all annotated ops to partial reduuction loops"),
+              clEnumValN(IREE::GPU::TilingLevel::Serial, "serial",
+                         "Tile and fuse all annotated ops to serial loops"),
               clEnumValN(IREE::GPU::TilingLevel::Thread, "thread",
                          "Tile and fuse all annotated ops to threads"),
               clEnumValN(IREE::GPU::TilingLevel::Subgroup, "subgroup",

@@ -4,6 +4,7 @@
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=subgroup}, canonicalize, cse))" %s | FileCheck %s --check-prefix=SUBGROUP
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=partial_reduction}, canonicalize, cse))" %s | FileCheck %s --check-prefix=PARTRED
 // RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{normalize-loops}, canonicalize, cse))" %s | FileCheck %s --check-prefix=NORM-REDUCTION
+// RUN: iree-opt --split-input-file --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level{tiling-level=serial}, canonicalize, cse))" %s | FileCheck %s --check-prefix=SERIAL
 
 #config = #iree_gpu.lowering_config<{thread = [2, 16], subgroup = [2, 16]}>
 #map = affine_map<(d0, d1) -> (d0, d1)>
@@ -654,3 +655,27 @@ func.func @no_swap_collapse_shape_with_extract_slice(%arg0: tensor<288x3x3x32xf3
 //       NORM-REDUCTION:     tensor.extract_slice {{.*}} tensor<2592x32xf32> to tensor<2592x?xf32>
 //   NORM-REDUCTION-NOT:     tensor.collapse_shape
 //       NORM-REDUCTION:     linalg.copy
+
+// -----
+
+#config = #iree_gpu.lowering_config<{serial = [0, 16]}>
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  func.func @serial_tiling(%3: tensor<4x256xf32>, %4: tensor<4x256xf32>, %5: tensor<4x256xf32>) -> tensor<4x256xf32> {
+    %6 = linalg.generic {
+      indexing_maps = [#map, #map, #map],
+      iterator_types = ["parallel", "parallel"]
+      } ins(%3, %4 : tensor<4x256xf32>, tensor<4x256xf32>) outs(%5 : tensor<4x256xf32>) attrs =  {lowering_config = #config} {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %7 = arith.addf %in, %in_0 : f32
+      linalg.yield %7 : f32
+    } -> tensor<4x256xf32>
+    return %6 : tensor<4x256xf32>
+  }
+}
+
+// SERIAL-LABEL: func.func @serial_tiling
+// SERIAL: scf.forall ({{.*}}) = (0) to (256) step (16)
+// SERIAL: linalg.generic
+// SERIAL: scf.forall.in_parallel
+// SERIAL-NOT: mapping
@@ -329,6 +329,10 @@ LogicalResult applyTileAndFuseToEachRoot(
       tilingOptions.setMapping(llvm::to_vector(llvm::reverse(mapping)));
     }
 
+    if (tilingLevel == IREE::GPU::TilingLevel::Serial) {
+      tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
+    }
+
     if (tilingLevel == IREE::GPU::TilingLevel::PartialReduction) {
       tilingOptions.setReductionTilingStrategy(
           ReductionTilingStrategy::PartialReductionOuterReduction);

@@ -1791,6 +1791,7 @@ std::array<int64_t, 3> TargetAttr::getMaximumWorkgroupCount() const {
 constexpr StringLiteral kWorkgroupLevelName = "workgroup";
 constexpr StringLiteral kPartialReductionLevelName = "partial_reduction";
 constexpr StringLiteral kReductionLevelName = "reduction";
+constexpr StringLiteral kSerialLevelName = "serial";
 constexpr StringLiteral kThreadLevelName = "thread";
 constexpr StringLiteral kSubgroupLevelName = "subgroup";
 constexpr StringLiteral kLaneLevelName = "lane";
@@ -1803,6 +1804,8 @@ StringRef getTilingLevelName(GPU::TilingLevel level) {
     return kPartialReductionLevelName;
   case GPU::TilingLevel::Reduction:
     return kReductionLevelName;
+  case GPU::TilingLevel::Serial:
+    return kSerialLevelName;
   case GPU::TilingLevel::Thread:
     return kThreadLevelName;
   case GPU::TilingLevel::Subgroup:

@@ -367,9 +367,10 @@ def IREEGPU_ScaledMMAIntrinsic : IREEGPU_I32Enum<"ScaledMMAIntrinsic",
 def Workgroup : I32EnumAttrCase<"Workgroup", 0>;
 def Reduction : I32EnumAttrCase<"Reduction", 1>;
 def PartialReduction : I32EnumAttrCase<"PartialReduction", 2>;
-def Thread : I32EnumAttrCase<"Thread", 3>;
-def Subgroup : I32EnumAttrCase<"Subgroup", 4>;
-def Lane : I32EnumAttrCase<"Lane", 5>;
+def Serial : I32EnumAttrCase<"Serial", 3>;
+def Thread : I32EnumAttrCase<"Thread", 4>;
+def Subgroup : I32EnumAttrCase<"Subgroup", 5>;
+def Lane : I32EnumAttrCase<"Lane", 6>;
 
 /// Enum descriptor for the set of tiling levels for GPU pass pipelines.
 /// Note that `Thread` tiling is mutually exclusive with `Subgroup` and
@@ -380,6 +381,7 @@ def IREEGPU_TilingLevel : IREEGPU_I32EnumAttr<"TilingLevel",
       Workgroup,
       Reduction,
       PartialReduction,
+      Serial,
       Thread,
       Subgroup,
       Lane

@@ -166,12 +166,12 @@ getVectorDistributeReductionConfig(
   // Set the configuration for the operation with no reduction dims.
   // The workgroup tile sizes are set by the reduction operation.
   if (reductionDims.empty()) {
-    SmallVector<int64_t> reductionTileSizes(op.getNumLoops(), 1);
+    SmallVector<int64_t> serialTileSizes(op.getNumLoops(), 1);
 
-    // For the shared wgp dimension, set the reduction tile sizes to be zero.
+    // For the shared wgp dimension, set the serial tile sizes to be zero.
     // Copy the workgroup tile sizes from the sharedWgpDims.
     for (const auto &[dim, tile_size] : sharedWgpTiles) {
-      reductionTileSizes[dim] = 0;
+      serialTileSizes[dim] = 0;
       workgroupTileSizes[dim] = tile_size;
     }
 
@@ -214,7 +214,7 @@ getVectorDistributeReductionConfig(
       subgroupBasis = 1;
     }
 
-    reductionTileSizes[parallelDims.back()] = lastDimReductionTileSize;
+    serialTileSizes[parallelDims.back()] = lastDimReductionTileSize;
     threadTileSizes[parallelDims.back()] = threadLoads;
     threadCounts[parallelDims.back()] = threadBasis;
     subGroupCounts[parallelDims.back()] = subgroupBasis;
@@ -227,7 +227,7 @@ getVectorDistributeReductionConfig(
 
     NamedAttribute configAttrs[] = {
         NamedAttribute("workgroup", b.getI64ArrayAttr(workgroupTileSizes)),
-        NamedAttribute("reduction", b.getI64ArrayAttr(reductionTileSizes)),
+        NamedAttribute("serial", b.getI64ArrayAttr(serialTileSizes)),
         NamedAttribute("thread", b.getI64ArrayAttr(threadTileSizes)),
         NamedAttribute("lane_basis", laneBasisAttr),
         NamedAttribute("subgroup_basis", subgroupBasisAttr)};

@@ -804,6 +804,17 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
     funcPassManager.addPass(createCSEPass());
   }
 
+  // Tile to serial loops.
+  {
+    GPUApplyTilingLevelPassOptions options;
+    options.tilingLevel = IREE::GPU::TilingLevel::Serial;
+    options.allowZeroSlices = true;
+    funcPassManager.addPass(createGPUApplyTilingLevelPass(options));
+    funcPassManager.addPass(affine::createLoopCoalescingPass());
+    funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
+    funcPassManager.addPass(createCSEPass());
+  }
+
   funcPassManager.addPass(IREE::LinalgExt::createDecomposeAttentionPass());
   funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());

@@ -174,7 +174,7 @@ func.func @test_multiple_reduction() {
 // CHECK-SAME:    outs(%{{.*}} : tensor<2x32x10x16384xf32>)
 // CHECK-SAME:    attrs =  {lowering_config = #iree_gpu.lowering_config<{
 // CHECK-SAME:              lane_basis = {{\[}}[1, 1, 1, 64], [0, 1, 2, 3]]
-// CHECK-SAME:              reduction = [0, 0, 1, 8192],
+// CHECK-SAME:              serial = [0, 0, 1, 8192],
 // CHECK-SAME:              subgroup_basis = {{\[}}[1, 1, 1, 16], [0, 1, 2, 3]],
 // CHECK-SAME:              thread = [0, 0, 0, 8],
 
@@ -248,7 +248,7 @@ func.func @test_multiple_stores(%arg0: !iree_tensor_ext.dispatch.tensor<readonly
 //       CHECK:   linalg.generic
 //  CHECK-SAME:      attrs =  {lowering_config = #iree_gpu.lowering_config<{
 //  CHECK-SAME:               lane_basis = {{\[}}[1, 64], [0, 1]],
-//  CHECK-SAME:               reduction = [0, 4096],
+//  CHECK-SAME:               serial = [0, 4096],
 //  CHECK-SAME:               subgroup_basis = {{\[}}[1, 16], [0, 1]],
 //  CHECK-SAME:               thread = [0, 4],
 //  CHECK-SAME:               workgroup = [1, 0]