[GPU][DT] Switch to query MMA intrinsics from IREE::GPU::TargetAttr. #18241

Merged
Changes from 2 commits

@@ -6,6 +6,11 @@

#include "iree/compiler/Codegen/Common/EncodingUtils.h"
#include "iree/compiler/Codegen/Common/GPU/Passes.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Dialect/Encoding/IR/EncodingDialect.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/Dialect/MemRef/Transforms/Transforms.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -35,14 +40,16 @@ static std::optional<TileMxNxK> getIntrinsicSize(TypeRange elementTypes) {
return std::nullopt;
}

// TODO: Query the value from GPU attributes.
// TODO: Define a struct with meaningful name for the pair.
SmallVector<int64_t> getIntrinsicVectorSize(TypeRange elementTypes,
int64_t roleIdx) {
Type lhs = elementTypes[0];
Type rhs = elementTypes[1];
Type out = elementTypes[2];
if (lhs.isF32() && rhs.isF32() && out.isF32()) {
/// Returns the corresponding native vector sizes defined by the `mma`
/// intrinsic.
static SmallVector<int64_t> getIntrinsicVectorSize(IREE::GPU::MMAAttr mma,
int64_t roleIdx) {
if (mma.getIntrinsic().getValue() ==
IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32) {
// TODO: Query the value from GPU attributes.
if (roleIdx == 0 || roleIdx == 1) {
return {1, 1};
}
if (roleIdx == 0 || roleIdx == 1) {
return {1, 1};
}
@@ -55,13 +62,11 @@ SmallVector<int64_t> getIntrinsicVectorSize(TypeRange elementTypes,

// Given encoding's role index and element types, return the transpose
// permutation used in GPU materialization.
SmallVector<int64_t> getTransposePermutation(int64_t roleIdx,
TypeRange elementTypes) {
// For now, check that all types are f32:
Type lhs = elementTypes[0];
Type rhs = elementTypes[1];
Type out = elementTypes[2];
if (!lhs.isF32() || !rhs.isF32() || !out.isF32()) {
static SmallVector<int64_t> getTransposePermutation(IREE::GPU::MMAAttr mma,
int64_t roleIdx) {
// TODO: Support other intrinsics.
if (mma.getIntrinsic().getValue() !=
IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32) {
return {};
}

@@ -81,27 +86,33 @@ SmallVector<int64_t> getTransposePermutation(int64_t roleIdx,
}
}

// TODO(hanchung): Pass an ExecutableTargetAttr attribute for the target
// encoding. Here we assume that every mfma op is available.
// TODO(hanchung): Handle wmma ops.
static SmallVector<TileMxNxK> enumerateMatmulTileMxNxK(TypeRange elementTypes) {
static std::optional<IREE::GPU::MMAAttr>
enumerateMmaIntrinsic(TypeRange elementTypes, IREE::GPU::TargetAttr target) {
assert(elementTypes.size() == 3);
Type lhs = elementTypes[0];
Type rhs = elementTypes[1];
Type out = elementTypes[2];
if (lhs.isF32() && rhs.isF32() && out.isF32()) {
// TODO: Take subgroup_size into account, so we can have more unrolling.
// TODO: Take the bitwidth of load into account, so we can have correct
// unrolling factor for K-dimension.
return {TileMxNxK{16, 16, 4}}; // Aim to use mfma_f32_16x16x4_f32 intrinsic.
for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) {
IREE::GPU::MMAIntrinsic type = mma.getIntrinsic().getValue();
// TODO: Drop this once all intrinsics are supported.
if (type != IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32) {
continue;
}

auto [aType, bType, cType] = mma.getABCElementTypes();
if (lhs != aType || rhs != bType || out != cType) {
continue;
}
return mma;
}

// Fallback - no architecture-optimized tile size for this case.
return {};
return std::nullopt;
}

static FailureOr<MaterializeEncodingInfo>
materializeEncodingForTarget(RankedTensorType tensorType) {
materializeEncodingForTarget(RankedTensorType tensorType,
IREE::HAL::ExecutableTargetAttr targetAttr) {
auto encoding =
dyn_cast_or_null<IREE::Encoding::EncodingAttr>(tensorType.getEncoding());
if (!encoding) {
@@ -113,28 +124,31 @@ materializeEncodingForTarget(RankedTensorType tensorType) {
cDims->n.size() > 1 || cDims->k.size() > 1) {
return failure();
}

// Enumerate available tile shapes for the given encoding and target.
IREE::GPU::TargetAttr gpuTargetAttr = getGPUTargetAttr(targetAttr);
auto elementTypes = llvm::to_vector(
llvm::map_range(encoding.getElementTypes().getValue(), [](Attribute a) {
return cast<TypeAttr>(a).getValue();
}));
SmallVector<TileMxNxK> enumeratedTileMxNxK =
enumerateMatmulTileMxNxK(elementTypes);
if (enumeratedTileMxNxK.empty()) {
std::optional<IREE::GPU::MMAAttr> mma =
enumerateMmaIntrinsic(elementTypes, gpuTargetAttr);
if (!mma) {
return failure();
}

// Map the matmul TileMxNxK to an actual tile shape for the tensor at hand,
// based on its operand index in the matmul.
// TODO: Support unrolling.
auto rank = tensorType.getRank();

auto encodingInfo =
getEncodingInfoForMatmul(encoding, rank, enumeratedTileMxNxK[0]);
TileMxNxK innerTile;
std::tie(innerTile.M, innerTile.N, innerTile.K) = mma->getMNKShape();
auto encodingInfo = getEncodingInfoForMatmul(encoding, rank, innerTile);

// insert inner tile shapes and permutation info
auto roleIdx = encoding.getOperandIndex().getInt();
auto intrinsicVectorSizes = getIntrinsicVectorSize(elementTypes, roleIdx);
auto permutation = getTransposePermutation(roleIdx, elementTypes);
auto intrinsicVectorSizes = getIntrinsicVectorSize(*mma, roleIdx);
auto permutation = getTransposePermutation(*mma, roleIdx);
encodingInfo.innerTileShapes = intrinsicVectorSizes;
encodingInfo.permutation = permutation;
return encodingInfo;
@@ -146,6 +160,11 @@ struct GPUMaterializeDeviceEncodingPass final
GPUMaterializeDeviceEncodingPass> {
using GPUMaterializeDeviceEncodingPassBase::
GPUMaterializeDeviceEncodingPassBase;
void getDependentDialects(DialectRegistry &registry) const override {
registry.insert<arith::ArithDialect, tensor::TensorDialect,
linalg::LinalgDialect, IREE::Encoding::IREEEncodingDialect,
IREE::GPU::IREEGPUDialect>();
}
void runOnOperation() override;
};

@@ -301,13 +320,25 @@ struct GPUSetEncodingOpLoweringConversion

} // namespace

// TODO(hanchung): Remove the wrapper after allowing the type converter to carry
// the targetAttr. For now, follow what CPU is doing.
static MaterializeEncodingFn
getMaterializeEncodingFn(IREE::HAL::ExecutableTargetAttr targetAttr) {
return
[targetAttr](
RankedTensorType tensorType) -> FailureOr<MaterializeEncodingInfo> {
return materializeEncodingForTarget(tensorType, targetAttr);
};
}

Comment from the PR author on lines +323 to +333:

This TODO will be killed after we land #18242.
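
For reference, a minimal sketch of the shape this could take once the wrapper is dropped, assuming a hypothetical MaterializeEncodingTypeConverter constructor that carries the target attribute itself (not an existing API, and not part of this PR):

// Hypothetical post-#18242 cleanup: the type converter carries the
// IREE::HAL::ExecutableTargetAttr, so no per-target lambda wrapper is needed.
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
MaterializeEncodingTypeConverter typeConverter(materializeEncodingForTarget,
                                               targetAttr);

Until then, the lambda above simply captures targetAttr so that the existing MaterializeEncodingFn signature, which only receives the tensor type, can stay unchanged.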

void GPUMaterializeDeviceEncodingPass::runOnOperation() {
MLIRContext *ctx = &getContext();
FunctionOpInterface funcOp = getOperation();
auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
{
RewritePatternSet patterns(ctx);
MaterializeEncodingTypeConverter typeConverter(
materializeEncodingForTarget);
getMaterializeEncodingFn(targetAttr));
MaterializeEncodingConversionTarget target(*funcOp.getContext());
MaterializeEncodingValueFn materializeEncodingValueFn =
[](RankedTensorType, OpBuilder,
@@ -26,6 +26,7 @@ iree_lit_test_suite(
"gpu_distribute_shared_memory.mlir",
"gpu_generalize_named_ops.mlir",
"gpu_lower_to_ukernels.mlir",
"gpu_materialize_encoding.mlir",
"gpu_nested_layout_contract_amdgpu.mlir",
"gpu_nested_layout_vector_distribution.mlir",
"gpu_pipeline.mlir",
@@ -22,6 +22,7 @@ iree_lit_test_suite(
"gpu_distribute_shared_memory.mlir"
"gpu_generalize_named_ops.mlir"
"gpu_lower_to_ukernels.mlir"
"gpu_materialize_encoding.mlir"
"gpu_nested_layout_contract_amdgpu.mlir"
"gpu_nested_layout_vector_distribution.mlir"
"gpu_pipeline.mlir"
Comment from the PR author on the new test file:

I don't know why it is marked as a new file; I ran git mv ... when I created the change. There are not many changes in the file. The main difference is that it defines a target attribute and adds it to the func op's attributes.

It also pads the CHECK prefixes with spaces so that the code starts on the same column.

@@ -0,0 +1,95 @@
// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-materialize-device-encoding))" --split-input-file %s | FileCheck %s

//-----------------------------------------------------------------------------
// 1. MFMA_F32_16x16x4_F32
//-----------------------------------------------------------------------------

#encoding = #iree_encoding.encoding<operand_index = 1, op_type = matmul, element_types = [f32, f32, f32], original_type = tensor<255x513xf32>,
user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>],
round_dims_to = array<i64: 16, 16, 16>>
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {
Comment from a reviewer:

We don't need to embed this whole structure in the test if you use --iree-gpu-test-target=cdna3 or something to have it auto-populate. You'd need to use the getGPUTargetAttr utility function.

Reply from the PR author:

This is a little tricky in the data-tiling setup because the type converter only takes a tensor type and an executable target attribute. We want a unified API setup for all the backends, but the IREE::GPU::TargetAttr is defined separately, which makes this hard. Perhaps I can expose the getCLGPUTarget method so I can have a better setup for the test; it is also needed in my other prototype. I'll fix it in a follow-up.
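
As a rough illustration of that follow-up (hypothetical; the helper name and signature are assumptions taken from this comment, not code in this PR), the materialization entry point could fall back to the command-line test target when the executable target does not embed an #iree_gpu.target attribute:

// Hypothetical sketch, not part of this PR: fall back to the target selected
// via --iree-gpu-test-target so that lit tests would not need to spell out
// the whole wgp<...> structure below.
IREE::GPU::TargetAttr gpuTargetAttr = getGPUTargetAttr(targetAttr);
if (!gpuTargetAttr) {
  // Assumed helper; the author mentions exposing getCLGPUTarget for this.
  gpuTargetAttr = getCLGPUTarget(tensorType.getContext());
}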

iree.gpu.target = #iree_gpu.target<arch = "gfx942",
features = "",
wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8,
storage = b64|b32|b16|b8,
subgroup = shuffle|arithmetic,
dot = dp4xi8toi32,
mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],
subgroup_size_choices = [64],
max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024,
max_workgroup_memory_bytes = 65536>>
}>
#pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,
#hal.descriptor_set.binding<1, storage_buffer>
]>
]>
func.func @set_encoding_LHS() attributes {
hal.executable.target = #executable_target_rocm_hsaco_fb
} {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<255x513xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<255x513xf32, #encoding>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<255x513xf32>> -> tensor<255x513xf32>
%3 = iree_encoding.set_encoding %2 : tensor<255x513xf32> -> tensor<255x513xf32, #encoding>
flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32, #encoding> -> !flow.dispatch.tensor<writeonly:tensor<255x513xf32, #encoding>>
return
}

// CHECK-LABEL: func.func @set_encoding_LHS
// CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<33x64x16x4xf32>
// CHECK: %[[PACK:.*]] = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %[[EMPTY]] : tensor<255x513xf32> -> tensor<33x64x16x4xf32>
// CHECK: %[[EXPAND_LHS:.*]] = tensor.expand_shape %[[PACK]]
// CHECK-SAME: output_shape [33, 64, 16, 1, 4, 1] : tensor<33x64x16x4xf32> into tensor<33x64x16x1x4x1xf32>
// CHECK: %[[EMPTY_LHS2:.*]] = tensor.empty() : tensor<33x64x4x16x1x1xf32>
// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose ins(%[[EXPAND_LHS]] : tensor<33x64x16x1x4x1xf32>) outs(%[[EMPTY_LHS2]] : tensor<33x64x4x16x1x1xf32>) permutation = [0, 1, 4, 2, 5, 3]
// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[TRANSPOSE]]
// CHECK: %[[EXPAND_LHS_2:.*]] = tensor.expand_shape %[[COLLAPSE]]
// CHECK: flow.dispatch.tensor.store %[[EXPAND_LHS_2]]

func.func @set_encoding_RHS() attributes {
hal.executable.target = #executable_target_rocm_hsaco_fb
} {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<255x513xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<255x513xf32, #encoding>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<255x513xf32>> -> tensor<255x513xf32>
%3 = iree_encoding.set_encoding %2 : tensor<255x513xf32> -> tensor<255x513xf32, #encoding>
flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32, #encoding> -> !flow.dispatch.tensor<writeonly:tensor<255x513xf32, #encoding>>
return
}

// CHECK-LABEL: func.func @set_encoding_RHS
// CHECK: %[[EMPTY_RHS:.*]] = tensor.empty() : tensor<33x64x16x4xf32>
// CHECK: %[[PACK_RHS:.*]] = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %3 : tensor<255x513xf32> -> tensor<33x64x16x4xf32>
// CHECK: %[[EXPAND_RHS:.*]] = tensor.expand_shape %[[PACK_RHS]]
// CHECK-SAME: output_shape [33, 64, 16, 1, 4, 1] : tensor<33x64x16x4xf32> into tensor<33x64x16x1x4x1xf32>
// CHECK: %[[EMPTY_RHS2:.*]] = tensor.empty() : tensor<33x64x4x16x1x1xf32>
// CHECK: %[[TRANSPOSE_RHS:.*]] = linalg.transpose ins(%[[EXPAND_RHS]] : tensor<33x64x16x1x4x1xf32>) outs(%[[EMPTY_RHS2]] : tensor<33x64x4x16x1x1xf32>) permutation = [0, 1, 4, 2, 5, 3]
// CHECK: %[[COLLAPSE_RHS:.*]] = tensor.collapse_shape %[[TRANSPOSE_RHS]]
// CHECK: %[[EXPAND_RHS_2:.*]] = tensor.expand_shape %[[COLLAPSE_RHS]]
// CHECK: flow.dispatch.tensor.store %[[EXPAND_RHS_2]]

func.func @set_encoding_ACC() attributes {
hal.executable.target = #executable_target_rocm_hsaco_fb
} {
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<255x513xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<255x513xf32, #encoding>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<255x513xf32>> -> tensor<255x513xf32>
%3 = iree_encoding.set_encoding %2 : tensor<255x513xf32> -> tensor<255x513xf32, #encoding>
flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [255, 513], strides = [1, 1] : tensor<255x513xf32, #encoding> -> !flow.dispatch.tensor<writeonly:tensor<255x513xf32, #encoding>>
return
}

// CHECK-LABEL: func.func @set_encoding_ACC
// CHECK: %[[EMPTY_ACC:.*]] = tensor.empty() : tensor<33x64x16x4xf32>
// CHECK: %[[PACK_ACC:.*]] = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %[[EMPTY_ACC]] : tensor<255x513xf32> -> tensor<33x64x16x4xf32>
// CHECK: %[[EXPAND_ACC:.*]] = tensor.expand_shape %[[PACK_ACC]]
// CHECK: %[[EMPTY_ACC2:.*]] = tensor.empty() : tensor<33x64x4x16x1x1xf32>
// CHECK: %[[TRANSPOSE_ACC:.*]] = linalg.transpose ins(%[[EXPAND_ACC]] : tensor<33x64x16x1x4x1xf32>) outs(%[[EMPTY_ACC2]] : tensor<33x64x4x16x1x1xf32>) permutation = [0, 1, 4, 2, 5, 3]
// CHECK: %[[COLLAPSE_RHS:.*]] = tensor.collapse_shape %[[TRANSPOSE_ACC]]
// CHECK: %[[EXPAND_ACC_2:.*]] = tensor.expand_shape %[[COLLAPSE_RHS]]
// CHECK: flow.dispatch.tensor.store %[[EXPAND_ACC_2]]
@@ -37,7 +37,6 @@ iree_lit_test_suite(
"config_winograd.mlir",
"extract_address_computation_gpu.mlir",
"gpu_set_num_workgroups.mlir",
"gpu_materialize_encoding.mlir",
"gpu_pipeline_generalize_named_ops.mlir",
"nvvm_extract_address_computation.mlir",
"nvvm_pipeline_test.mlir",
@@ -33,7 +33,6 @@ iree_lit_test_suite(
"distribute_to_thread.mlir"
"elementwise_pipeline.mlir"
"extract_address_computation_gpu.mlir"
"gpu_materialize_encoding.mlir"
"gpu_pipeline_generalize_named_ops.mlir"
"gpu_set_num_workgroups.mlir"
"illegal_configuration.mlir"