[LLVMGPU][Codegen] Emit packed chain FMA from select multi_reductions and contracts (#21855)

efric · web-flow · commit b98c1b92cb63 · 2025-11-21T15:44:03.000-08:00
This patch teaches the vector lowering pipeline to: 1. Rewrite `vector.multi_reduction<add>` whose input is `arith.mulf` into a `vector.contract` via `vector::populateVectorReductionToContractPatterns`; 2. Lower a restricted set of `vector.contract` ops into packed FMA chains. Previously, lowering a `vector.multi_reduction` of the same form produced elementwise pack-muls per K-slice and then reduced them with a left-associated, serial chain of `v_add_f{16, 32}`: `mul(a0, b0) + mul(a1, b1) + … + acc`. The new lowering emits a single nested FMA chain and folds the accumulation into the `math.fma` c-operand: `fma(a0, b0, fma(a1, b1, fma(a2, b2, fma(a3, b3, acc))))`. To do this, we first permute the reduction and parallel dimensions of the `LHS` and `RHS` to the order of `[reduction, ..., parallel, ...]`. The `LHS` and `RHS` are then collapsed to a 2D shape of `{Π reduction dimensions, Π parallel dimensions}`. Then we form the FMA chain by iterating backwards, seeded by the accumulator. Not all forms of `vector.contract` are suitable in the current approach. One example is when an operand drops a parallel iterator, as in matmul-like contracts; we require both sides to share the same 2D tuple. Unsupported cases fall back to the existing lowering. Fixes: #21483 (variant of original issue; for [issue #21513](#21513)). --------- Signed-off-by: Eric Feng <Eric.Feng@amd.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorLowering.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Dialect/Math/Transforms/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -183,6 +184,321 @@ struct SetMulAddFMF final : OpRewritePattern<vector::MultiDimReductionOp> {
   }
 };
 
+// Rewrites vector.contracts into a chain of math.fma ops when possible.
+// Starting from the innermost position of the reduction dimension,
+// the lowering emits a single nested FMA chain as follows:
+// fma(a0 ,b0, fma(a1, b1, fma(a2, b2, fma(a3, b3, acc))))
+// where ai and bi are the elements extracted from lhs and rhs vectors
+// respectively along the reduction dimension.
+//
+// Example:
+// ```mlir
+// #map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+// #map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
+// vector.contract
+// {
+//    indexing_maps = [#map, #map, #map1],
+//    iterator_types = ["parallel", "parallel", "reduction"],
+//    kind = #vector.kind<add>
+// }
+// %arg0, %arg1, %cst : vector<2x1x8xf16>, vector<2x1x8xf16> into
+// vector<2x1xf16>
+// ```
+//
+// ==>
+// <Extract lhs/rhs along reduction dim> then:
+// ```mlir
+// %34 = math.fma %32, %33, %cst : vector<2xf16>
+// %37 = math.fma %35, %36, %34 : vector<2xf16>
+// %40 = math.fma %38, %39, %37 : vector<2xf16>
+// %43 = math.fma %41, %42, %40 : vector<2xf16>
+// %46 = math.fma %44, %45, %43 : vector<2xf16>
+// %49 = math.fma %47, %48, %46 : vector<2xf16>
+// %52 = math.fma %50, %51, %49 : vector<2xf16>
+// %55 = math.fma %53, %54, %52 : vector<2xf16>
+// ```
+//
+// Previously, contracts of the same form lowered to elementwise multiplies
+// followed by a vector.reduce. This lowering elides the need to reduce the
+// result of the elementwise operations separately and instead accumulates
+// the result directly via FMAs, offering more profitable instruction-level
+// scheduling on GPUs.
+struct ContractToChainFMA final : OpRewritePattern<vector::ContractionOp> {
+  using Base::Base;
+  /// Rewrites an unmasked `add` contraction into a chain of math.fma ops.
+  LogicalResult matchAndRewrite(vector::ContractionOp op,
+                                PatternRewriter &rewriter) const override {
+    // TODO: Add a rewrite to support relevant contractions nested in
+    // vector.mask.
+    if (op.isMasked() || op.getKind() != vector::CombiningKind::ADD) {
+      return failure();
+    }
+    // Scalable operand, result, or accumulator vectors are not handled.
+    VectorType lhsVecType = op.getLhsType();
+    VectorType rhsVecType = op.getRhsType();
+    if (lhsVecType.isScalable() || rhsVecType.isScalable()) {
+      return failure();
+    }
+    // A non-vector (scalar) result is rejected here.
+    auto resultVecType = dyn_cast<VectorType>(op.getResultType());
+    if (!resultVecType || resultVecType.isScalable()) {
+      return failure();
+    }
+
+    auto maybeAccVecType = dyn_cast<VectorType>(op.getAccType());
+    if (maybeAccVecType && maybeAccVecType.isScalable()) {
+      return failure();
+    }
+    // Only the LHS element type is checked for being a float type.
+    if (!isa<FloatType>(lhsVecType.getElementType())) {
+      return failure();
+    }
+
+    SmallVector<int64_t> redDims, parDims;
+    getReductionAndParallelLoopDims(op.getIteratorTypes(), redDims, parDims);
+    if (redDims.empty()) {
+      return failure();
+    }
+    // Compute in the accumulator's element type, extending operands if needed.
+    auto elemType = getElementTypeOrSelf(op.getAccType());
+
+    Location loc = op.getLoc();
+    Value lhs = op.getLhs();
+    Value rhs = op.getRhs();
+
+    if (lhsVecType.getElementType() != elemType) {
+      Type promotedType = lhsVecType.clone(elemType);
+      lhs = arith::ExtFOp::create(rewriter, loc, promotedType, lhs);
+      lhsVecType = cast<VectorType>(lhs.getType());
+    }
+
+    if (rhsVecType.getElementType() != elemType) {
+      Type promotedType = rhsVecType.clone(elemType);
+      rhs = arith::ExtFOp::create(rewriter, loc, promotedType, rhs);
+      rhsVecType = cast<VectorType>(rhs.getType());
+    }
+
+    // Iterator dims as [reduction..., parallel...]. NOTE(review): unused below.
+    auto indices = llvm::to_vector(llvm::concat<int64_t>(redDims, parDims));
+
+    ArrayRef<int64_t> lhsShape = lhsVecType.getShape();
+    ArrayRef<int64_t> rhsShape = rhsVecType.getShape();
+    SmallVector<AffineMap, 4> maps = op.getIndexingMapsArray();
+    AffineMap lhsMap = maps[0];
+    AffineMap rhsMap = maps[1];
+    AffineMap accMap = maps[2];
+    // Broadcast operands over parallel dimensions they do not index; this also
+    // fills {lhs,rhs}Transpose with the [reduction..., parallel...] permutation.
+    unsigned numParallelDims = accMap.getNumResults();
+
+    SmallVector<int64_t> lhsTranspose, rhsTranspose;
+    lhs = broadcastMissingDims(
+        rewriter, loc, lhsMap, accMap, op.getIteratorTypes(), numParallelDims,
+        resultVecType, lhs, lhsShape, elemType, lhsTranspose);
+    rhs = broadcastMissingDims(
+        rewriter, loc, rhsMap, accMap, op.getIteratorTypes(), numParallelDims,
+        resultVecType, rhs, rhsShape, elemType, rhsTranspose);
+
+    // Apply transposes to get [reduction..., parallel...] layout.
+    lhs = vector::TransposeOp::create(rewriter, loc, lhs, lhsTranspose);
+    rhs = vector::TransposeOp::create(rewriter, loc, rhs, rhsTranspose);
+    // NOTE(review): verify accPerm matches the operands' acc-result dim order.
+    SmallVector<int64_t> accPerm;
+    if (maybeAccVecType) {
+      accPerm = getPermutationFromIndexingMap(maps[2], parDims);
+    }
+
+    const size_t numRed = redDims.size();
+    auto lhsTransposedVecType = cast<VectorType>(lhs.getType());
+    int64_t lhsRedSize = productOfDims(lhsTransposedVecType, 0, numRed);
+    int64_t lhsParSize = productOfDims(lhsTransposedVecType, numRed,
+                                       lhsTransposedVecType.getRank());
+    // Shape-cast operands to 2D {reduction_size, parallel_size}. The sizes come
+    // from the LHS only; the RHS is cast to the same 2D type.
+    int64_t redSize = lhsRedSize;
+    int64_t parSize = lhsParSize;
+    auto flattened2DType = VectorType::get({redSize, parSize}, elemType);
+    Value lhs2D =
+        vector::ShapeCastOp::create(rewriter, loc, flattened2DType, lhs);
+    Value rhs2D =
+        vector::ShapeCastOp::create(rewriter, loc, flattened2DType, rhs);
+
+    Value flattenedAcc;
+    auto flatAccVecType = VectorType::get({parSize}, elemType);
+    VectorType preFlattenVecType = maybeAccVecType;
+    // NOTE(review): non-vector results bail out above; is the `else` reachable?
+    if (maybeAccVecType) {
+      Value acc = op.getAcc();
+
+      if (!isIdentityPermutation(accPerm)) {
+        acc = vector::TransposeOp::create(rewriter, loc, acc, accPerm);
+        preFlattenVecType = cast<VectorType>(acc.getType());
+      }
+
+      flattenedAcc =
+          vector::ShapeCastOp::create(rewriter, loc, flatAccVecType, acc);
+    } else {
+      flattenedAcc = vector::BroadcastOp::create(rewriter, loc, flatAccVecType,
+                                                 op.getAcc());
+    }
+
+    Value resultFlat =
+        buildFMAChain(rewriter, loc, lhs2D, rhs2D, flattenedAcc, redSize);
+
+    // Undo the flattening (and the acc transpose, if any) on the result.
+    Value result;
+    if (maybeAccVecType) {
+      Value reshaped = vector::ShapeCastOp::create(
+          rewriter, loc, preFlattenVecType, resultFlat);
+
+      if (!isIdentityPermutation(accPerm)) {
+        result = vector::TransposeOp::create(rewriter, loc, maybeAccVecType,
+                                             reshaped, invert(accPerm));
+      } else {
+        result = reshaped;
+      }
+
+    } else {
+      result = vector::ExtractOp::create(rewriter, loc, resultFlat, 0);
+    }
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+
+private:
+  static Value broadcastMissingDims(
+      PatternRewriter &rewriter, Location loc, AffineMap operandMap,
+      AffineMap accMap, ArrayAttr iteratorTypes, unsigned numParallelDims,
+      VectorType resultType, Value operand, ArrayRef<int64_t> operandShape,
+      Type elemType, SmallVectorImpl<int64_t> &transpose) {
+    SmallVector<int64_t> reductionDims =
+        getReductionIndex(operandMap, iteratorTypes);
+    // Number of accumulator dims minus the operand's non-reduction dims.
+    unsigned numDimToBroadcast =
+        numParallelDims - (operandMap.getNumResults() - reductionDims.size());
+
+    SmallVector<int64_t> broadcastDims;
+    // Reduction dims go first, offset by numDimToBroadcast.
+    for (int64_t dim : reductionDims) {
+      transpose.push_back(numDimToBroadcast + dim);
+    }
+    // Then parallel dims in accumulator order; missing ones get broadcast.
+    for (unsigned i = 0; i < numParallelDims; ++i) {
+      unsigned iterDim = accMap.getDimPosition(i);
+
+      std::optional<unsigned> opDim = getDimPosition(operandMap, iterDim);
+      if (opDim) {
+        transpose.push_back(numDimToBroadcast + *opDim);
+      } else {
+        broadcastDims.push_back(resultType.getDimSize(i));
+        transpose.push_back(broadcastDims.size() - 1);
+      }
+    }
+
+    Value result = operand;
+    if (!broadcastDims.empty()) {
+      llvm::append_range(broadcastDims, operandShape);
+      auto expandedType = VectorType::get(broadcastDims, elemType);
+      result = vector::BroadcastOp::create(rewriter, loc, expandedType, result);
+    }
+
+    return result;
+  }
+  /// Returns the result index of `map` that refers to `dim`, if any.
+  static std::optional<unsigned> getDimPosition(AffineMap map, unsigned dim) {
+    for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
+      if (map.getDimPosition(i) == dim)
+        return i;
+    }
+    return std::nullopt;
+  }
+  /// Returns the result indices of `map` bound to reduction iterators.
+  static SmallVector<int64_t> getReductionIndex(AffineMap map,
+                                                ArrayAttr iteratorTypes) {
+    SmallVector<int64_t> dimsIdx;
+    for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
+      if (vector::isReductionIterator(iteratorTypes[map.getDimPosition(i)]))
+        dimsIdx.push_back(i);
+    }
+    return dimsIdx;
+  }
+  /// Returns the inverse permutation, i.e. inv[perm[i]] = i.
+  static SmallVector<int64_t> invert(ArrayRef<int64_t> perm) {
+    SmallVector<int64_t> inv(perm.size());
+    for (auto [i, p] : llvm::enumerate(perm)) {
+      inv[p] = i;
+    }
+    return inv;
+  }
+  /// Splits iterator indices into reduction (`red`) and all others (`par`).
+  static void getReductionAndParallelLoopDims(ArrayAttr iters,
+                                              SmallVectorImpl<int64_t> &red,
+                                              SmallVectorImpl<int64_t> &par) {
+    for (auto [idx, attr] : llvm::enumerate(iters)) {
+      if (vector::isReductionIterator(attr)) {
+        red.push_back(idx);
+      } else {
+        par.push_back(idx);
+      }
+    }
+  }
+
+  /// Constructs a permutation for vector.transpose from an affine map and a
+  /// reordered list of dimensions.
+  ///
+  /// Example:
+  ///   map: (d0, d1, d2) -> (d0, d2, d1)
+  ///   iterator_types = ["parallel","parallel","reduction"]
+  ///   ==> new dim order: [2, 0, 1]
+  ///
+  ///   Step 1: Build dim-to-result mapping from the map.
+  ///           dimToRes = [0, 2, 1] i.e {0: 0, 1: 2, 2: 1}
+  ///
+  ///   Step 2: Walk the new dimension order to build the permutation.
+  ///           indices[0]=2 -> dimToRes[2]=1
+  ///           indices[1]=0 -> dimToRes[0]=0
+  ///           indices[2]=1 -> dimToRes[1]=2
+  ///
+  ///   Result: perm = [1, 0, 2]
+  static SmallVector<int64_t>
+  getPermutationFromIndexingMap(AffineMap map, ArrayRef<int64_t> indices) {
+    SmallVector<int64_t> dimToRes(map.getNumDims());
+    for (int res = 0, e = map.getNumResults(); res != e; ++res) {
+      dimToRes[map.getDimPosition(res)] = res;
+    }
+
+    return to_vector(
+        llvm::map_range(indices, [&](int64_t i) { return dimToRes[i]; }));
+  }
+  /// Returns the product of the dim sizes of `vt` in the range [lo, hi).
+  static int64_t productOfDims(VectorType vt, unsigned lo, unsigned hi) {
+    int64_t p = 1;
+    for (unsigned i = lo; i < hi; ++i) {
+      p *= vt.getDimSize(i);
+    }
+    return p;
+  }
+  /// Returns true if perm[i] == i for every i (vacuously true when empty).
+  static bool isIdentityPermutation(ArrayRef<int64_t> perm) {
+    return llvm::all_of(llvm::enumerate(perm),
+                        [](auto p) { return p.value() == p.index(); });
+  }
+  /// Emits K math.fma ops, from row K-1 down to row 0, seeded with `accFlat`.
+  static Value buildFMAChain(PatternRewriter &rewriter, Location loc,
+                             Value lhs2D, Value rhs2D, Value accFlat,
+                             int64_t K) {
+    Value current = accFlat;
+
+    for (int64_t k = K - 1; k >= 0; --k) {
+      Value a = vector::ExtractOp::create(rewriter, loc, lhs2D, k);
+      Value b = vector::ExtractOp::create(rewriter, loc, rhs2D, k);
+      current = math::FmaOp::create(rewriter, loc, a, b, current);
+    }
+    return current;
+  }
+};
+
 struct LLVMGPUVectorLoweringPass final
     : impl::LLVMGPUVectorLoweringPassBase<LLVMGPUVectorLoweringPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
@@ -206,6 +522,14 @@ struct LLVMGPUVectorLoweringPass final
       }
     }
 
+    {
+      RewritePatternSet patterns(ctx);
+      vector::populateVectorReductionToContractPatterns(patterns);
+      if (failed(applyPatternsGreedily(funcOp, std::move(patterns)))) {
+        return signalPassFailure();
+      }
+    }
+
     {
       // Lower high level vector operations like contract or multidim reduce ops
       // to lower level vector ops.
@@ -222,6 +546,8 @@ struct LLVMGPUVectorLoweringPass final
           contractLoweringPatterns, options.vectorContractLowering);
       contractLoweringPatterns.add<PromoteContractOperands>(
           funcOp->getContext());
+      contractLoweringPatterns.add<ContractToChainFMA>(funcOp->getContext(),
+                                                       PatternBenefit(2));
       vector::populateVectorGatherLoweringPatterns(contractLoweringPatterns);
       vector::populateVectorMaskOpLoweringPatterns(contractLoweringPatterns);
       vector::populateVectorShapeCastLoweringPatterns(contractLoweringPatterns);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir
@@ -87,7 +87,7 @@ hal.executable @dot_dispatch_0 {
 //            CHECK:   llvm.br
 //            CHECK:    llvm.load {{.*}} : !llvm.ptr<1> -> vector<32xf32>
 //   CHECK-COUNT-32:    llvm.load {{.*}} : !llvm.ptr<1> -> vector<16xf32>
-//   CHECK-COUNT-32:    llvm.intr.fmuladd({{.*}}) : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
+//   CHECK-COUNT-512:  llvm.call @__nv_fmaf({{.*}}) : (f32, f32, f32) -> f32
 //            CHECK:    llvm.store {{.*}} : vector<16xf32>, !llvm.ptr<1>
 
 // -----
@@ -151,7 +151,7 @@ hal.executable @dot_dispatch_0 {
 //   CHECK-LABEL: hal.executable public @dot_dispatch_0
 //            CHECK:   hal.executable.variant public @cuda
 //            CHECK:  llvm.br
-//   CHECK-COUNT-32:    llvm.intr.fmuladd({{.*}}) : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
+//   CHECK-COUNT-512:  llvm.call @__nv_fmaf({{.*}}) : (f32, f32, f32) -> f32
 //            CHECK:    llvm.store {{.*}} : vector<16xf32>, !llvm.ptr<1>
 
 // -----
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir
@@ -88,7 +88,7 @@ hal.executable @dot_dispatch_0 {
 //           RDNA3:   llvm.br
 //   RDNA3-COUNT-1:    llvm.load {{.*}} : !llvm.ptr<7> -> vector<32xf32>
 //  RDNA3-COUNT-32:    llvm.load {{.*}} : !llvm.ptr<7> -> vector<16xf32>
-//  RDNA3-COUNT-32:    llvm.intr.fmuladd({{.*}}) : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
+//  RDNA3-COUNT-32:    llvm.intr.fma({{.*}}) : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
 //   RDNA3-COUNT-1:    llvm.store {{.*}} : vector<16xf32>, !llvm.ptr<7>
 //           RDNA3:   llvm.br
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_lowering.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/vector_lowering.mlir