diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
index 3a76f44dea692..1d4c7e89c724e 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
@@ -6,8 +6,11 @@
 
 #include "iree/compiler/Codegen/Common/EncodingUtils.h"
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "llvm/ADT/SmallVector.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Tensor/Transforms/Transforms.h"
+#include "mlir/IR/BuiltinTypes.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 #define DEBUG_TYPE "iree-codegen-gpu-materialize-encoding"
@@ -36,8 +39,10 @@ getIntrinsicVectorSize(TypeRange elementTypes, int64_t roleIdx) {
   Type rhs = elementTypes[1];
   Type out = elementTypes[2];
   if (lhs.isF32() && rhs.isF32() && out.isF32()) {
-    if (roleIdx == 0 || roleIdx == 1) return std::make_pair(1, 1);
-    if (roleIdx == 2) return std::make_pair(4, 1);
+    if (roleIdx == 0 || roleIdx == 1)
+      return std::make_pair(1, 1);
+    if (roleIdx == 2)
+      return std::make_pair(4, 1);
   }
   return std::nullopt;
 }
@@ -144,6 +149,7 @@ struct GPUSetEncodingOpLoweringConversion
     auto elemTypes = llvm::map_to_vector(
         encoding.getElementTypes().getValue(),
         [](Attribute a) { return cast<TypeAttr>(a).getValue(); });
+    auto loc = encodingOp.getLoc();
     std::optional<TileMxNxK> intrinsicShape = getIntrinsicSize(elemTypes);
     std::optional<std::pair<int64_t, int64_t>> intrinsicVectorShape =
         getIntrinsicVectorSize(elemTypes, roleIdx);
@@ -152,18 +158,18 @@ struct GPUSetEncodingOpLoweringConversion
     }
 
     SmallVector<int64_t> targetShape; // for unrolling
-    switch(roleIdx) {
-    case 0:
-      targetShape = {intrinsicShape->M, intrinsicShape->K};
-      break;
-    case 1:
-      targetShape = {intrinsicShape->N, intrinsicShape->K};
-      break;
-    case 2:
-      targetShape = {intrinsicShape->M, intrinsicShape->N};
-      break;
-    default:
-      return failure();
+    switch (roleIdx) {
+    case 0: // A
+      targetShape = {intrinsicShape->M, intrinsicShape->K};
+      break;
+    case 1: // B
+      targetShape = {intrinsicShape->N, intrinsicShape->K};
+      break;
+    case 2: // C
+      targetShape = {intrinsicShape->M, intrinsicShape->N};
+      break;
+    default:
+      return failure();
     }
 
     assert(innerTiles.size() == targetShape.size());
@@ -175,15 +181,42 @@ struct GPUSetEncodingOpLoweringConversion
       assert(packedShape == targetShape);
     }
 
-    // TODO(lialan): create expand_shape. Take LHS as an example:
-    // 16x4xf32 -> 16x1x4x1. Because the vector size used in the intrinsic is
-    // 1x1.
-    // For C-Matrix (i.e., ACC), it is 16x16xf32 -> 4x4x16x1xf32. Because the
-    // vector size is 4x1.
+    // Check that the dimensions of the matrix can be divided by the tile
+    // shape; if not, bail out.
+    auto sourceType = encodingOp.getSourceType().getShape();
+    assert(sourceType.size() == 2);
+    if (sourceType[0] % innerTiles[0] != 0 ||
+        sourceType[1] % innerTiles[1] != 0) {
+      return failure();
+    }
+
+    // Create expand_shape.
+    llvm::SmallVector<int64_t> expandShapeShape;
+    auto [iM, iK] = *intrinsicVectorShape;
+    auto oM = sourceType[0] / iM;
+    auto oK = sourceType[1] / iK;
+    expandShapeShape = {oM, iM, oK, iK};
+    assert(expandShapeShape.size() == 4);
+    RankedTensorType expandShapeType =
+        RankedTensorType::Builder(encodingOp.getSourceType())
+            .setShape(expandShapeShape);
+    SmallVector<ReassociationIndices> reassociation = {{0, 1}, {2, 3}};
+    Value expandShapeOp = rewriter.create<tensor::ExpandShapeOp>(
+        loc, expandShapeType, packOp->getResult(), reassociation);
 
-    // TODO(lialan): create linalg.transpose op.
+    // Create linalg.transpose.
     // LHS: 16x1x4x1 -> 4x16x1x1 (perm = [2, 0, 3, 1])
     // ACC: 4x4x16x1 -> 4x16x4x1 (perm = [0, 2, 1, 3])
+    SmallVector<int64_t> permutation =
+        roleIdx == 2 ? SmallVector<int64_t>{0, 2, 1, 3}
+                     : SmallVector<int64_t>{2, 0, 3, 1};
+    SmallVector<int64_t> transposedShape;
+    for (int64_t dim : permutation)
+      transposedShape.push_back(expandShapeShape[dim]);
+    auto emptyTensor = rewriter.create<tensor::EmptyOp>(
+        loc, transposedShape, encodingOp.getSourceType().getElementType());
+    [[maybe_unused]] auto transposeOp = rewriter.create<linalg::TransposeOp>(
+        loc, expandShapeOp, emptyTensor, permutation);
 
     // TODO(hanchung): We want to make the shape consistent, so we need to
     // collpase and expand the shape. This is the shape we materialize for Flow
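For reference, the shape bookkeeping in the last hunk can be sanity-checked with a small standalone program. This is illustrative only, not part of the change: it assumes the f32 intrinsic behind getIntrinsicVectorSize (vector size 1x1 for LHS/RHS, 4x1 for ACC) and the packed tile shapes quoted in the comments (16x4 for LHS, 16x16 for ACC), then applies the same expand-and-permute steps as the pattern.

#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone sketch: reproduce the expand_shape / transpose shapes from the
// comments in GPUSetEncodingOpLoweringConversion for the f32 case.
int main() {
  struct Case {
    const char *name;
    std::array<int64_t, 2> tile; // packed inner-tile shape (from the comments)
    std::array<int64_t, 2> vec;  // intrinsic vector size for this role
    std::array<int64_t, 4> perm; // linalg.transpose permutation
  };
  const std::vector<Case> cases = {
      {"LHS", {16, 4}, {1, 1}, {2, 0, 3, 1}},
      {"ACC", {16, 16}, {4, 1}, {0, 2, 1, 3}},
  };
  for (const Case &c : cases) {
    // Expand each dimension d into (d / vec, vec), mirroring
    // expandShapeShape = {oM, iM, oK, iK} in the patch.
    std::array<int64_t, 4> expanded = {c.tile[0] / c.vec[0], c.vec[0],
                                       c.tile[1] / c.vec[1], c.vec[1]};
    // Apply the role-specific permutation: transposed[i] = expanded[perm[i]].
    std::array<int64_t, 4> transposed;
    for (int i = 0; i < 4; ++i)
      transposed[i] = expanded[c.perm[i]];
    std::cout << c.name << ": expand_shape -> " << expanded[0] << "x"
              << expanded[1] << "x" << expanded[2] << "x" << expanded[3]
              << ", transpose -> " << transposed[0] << "x" << transposed[1]
              << "x" << transposed[2] << "x" << transposed[3] << "\n";
  }
}

Running it prints 16x1x4x1 -> 4x16x1x1 for LHS and 4x4x16x1 -> 4x16x4x1 for ACC, matching the shapes and permutations in the comments, which is a quick way to check a new role/permutation before wiring it into the pass.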