From 0c286eb40c054730cf880a5577e462c7cc605c7f Mon Sep 17 00:00:00 2001
From: hanhanW
Date: Thu, 15 Aug 2024 17:09:14 -0700
Subject: [PATCH 1/3] [GPU][DT] Switch to query MMA intrinsics from
 IREE::GPU::TargetAttr.

The revision also moves the lit test to `Common/GPU/test/`.

Signed-off-by: hanhanW
---
 .../Common/GPU/GPUMaterializeEncoding.cpp     | 101 ++++++++++++------
 .../Codegen/Common/GPU/test/BUILD.bazel       |   1 +
 .../Codegen/Common/GPU/test/CMakeLists.txt    |   1 +
 .../GPU}/test/gpu_materialize_encoding.mlir   |  34 ++++--
 .../compiler/Codegen/LLVMGPU/test/BUILD.bazel |   1 -
 .../Codegen/LLVMGPU/test/CMakeLists.txt       |   1 -
 6 files changed, 94 insertions(+), 45 deletions(-)
 rename compiler/src/iree/compiler/Codegen/{LLVMGPU => Common/GPU}/test/gpu_materialize_encoding.mlir (84%)

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
index fe74d62c832c..59ac6a04dbf0 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUMaterializeEncoding.cpp
@@ -6,6 +6,11 @@
 
 #include "iree/compiler/Codegen/Common/EncodingUtils.h"
 #include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUInterfaces.h"
+#include "iree/compiler/Codegen/Utils/GPUUtils.h"
+#include "iree/compiler/Dialect/Encoding/IR/EncodingDialect.h"
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -35,14 +40,16 @@ static std::optional<TileMxNxK> getIntrinsicSize(TypeRange elementTypes) {
   return std::nullopt;
 }
 
-// TODO: Query the value from GPU attributes.
-// TODO: Define a struct with meaningful name for the pair.
-SmallVector<int64_t> getIntrinsicVectorSize(TypeRange elementTypes,
-                                            int64_t roleIdx) {
-  Type lhs = elementTypes[0];
-  Type rhs = elementTypes[1];
-  Type out = elementTypes[2];
-  if (lhs.isF32() && rhs.isF32() && out.isF32()) {
+/// Returns the corresponding native vector sizes defined by the `mma`
+/// intrinsic.
+static SmallVector<int64_t> getIntrinsicVectorSize(IREE::GPU::MMAAttr mma,
+                                                   int64_t roleIdx) {
+  if (mma.getIntrinsic().getValue() ==
+      IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32) {
+    // TODO: Query the value from GPU attributes.
     if (roleIdx == 0 || roleIdx == 1) {
       return {1, 1};
     }
@@ -55,13 +62,11 @@ SmallVector<int64_t> getIntrinsicVectorSize(TypeRange elementTypes,
 
 // Given encoding's role index and element types, return the transpose
 // permutation used in GPU materialization.
-SmallVector<int64_t> getTransposePermutation(int64_t roleIdx,
-                                             TypeRange elementTypes) {
-  // For now, check that all types are f32:
-  Type lhs = elementTypes[0];
-  Type rhs = elementTypes[1];
-  Type out = elementTypes[2];
-  if (!lhs.isF32() || !rhs.isF32() || !out.isF32()) {
+static SmallVector<int64_t> getTransposePermutation(IREE::GPU::MMAAttr mma,
+                                                    int64_t roleIdx) {
+  // TODO: Support other intrinsics.
+  if (mma.getIntrinsic().getValue() !=
+      IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32) {
     return {};
   }
 
@@ -81,27 +86,33 @@ SmallVector<int64_t> getTransposePermutation(int64_t roleIdx,
   }
 }
 
-// TODO(hanchung): Pass an ExecutableTargetAttr attribute for the target
-// encoding. Here we assume that every mfma op is available.
-// TODO(hanchung): Handle wmma ops.
-static SmallVector<TileMxNxK> enumerateMatmulTileMxNxK(TypeRange elementTypes) {
+static std::optional<IREE::GPU::MMAAttr>
+enumerateMmaIntrinsic(TypeRange elementTypes, IREE::GPU::TargetAttr target) {
   assert(elementTypes.size() == 3);
   Type lhs = elementTypes[0];
   Type rhs = elementTypes[1];
   Type out = elementTypes[2];
-  if (lhs.isF32() && rhs.isF32() && out.isF32()) {
-    // TODO: Take subgroup_size into account, so we can have more unrolling.
-    // TODO: Take the bitwidth of load into account, so we can have correct
-    // unrolling factor for K-dimension.
-    return {TileMxNxK{16, 16, 4}}; // Aim to use mfma_f32_16x16x4_f32 intrinsic.
+  for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) {
+    IREE::GPU::MMAIntrinsic type = mma.getIntrinsic().getValue();
+    // TODO: Drop this once all intrinsics are supported.
+    if (type != IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x4_F32) {
+      continue;
+    }
+
+    auto [aType, bType, cType] = mma.getABCElementTypes();
+    if (lhs != aType || rhs != bType || out != cType) {
+      continue;
+    }
+    return mma;
   }
 
   // Fallback - no architecture-optimized tile size for this case.
-  return {};
+  return std::nullopt;
 }
 
 static FailureOr<MaterializeEncodingInfo>
-materializeEncodingForTarget(RankedTensorType tensorType) {
+materializeEncodingForTarget(RankedTensorType tensorType,
+                             IREE::HAL::ExecutableTargetAttr targetAttr) {
   auto encoding =
       dyn_cast_or_null<IREE::Encoding::EncodingAttr>(tensorType.getEncoding());
   if (!encoding) {
@@ -113,28 +124,31 @@ materializeEncodingForTarget(RankedTensorType tensorType) {
       cDims->n.size() > 1 || cDims->k.size() > 1) {
     return failure();
   }
+
+  // Enumerate available tile shapes for the given encoding and target.
+  IREE::GPU::TargetAttr gpuTargetAttr = getGPUTargetAttr(targetAttr);
   auto elementTypes = llvm::to_vector(
       llvm::map_range(encoding.getElementTypes().getValue(), [](Attribute a) {
         return cast<TypeAttr>(a).getValue();
       }));
-  SmallVector<TileMxNxK> enumeratedTileMxNxK =
-      enumerateMatmulTileMxNxK(elementTypes);
-  if (enumeratedTileMxNxK.empty()) {
+  std::optional<IREE::GPU::MMAAttr> mma =
+      enumerateMmaIntrinsic(elementTypes, gpuTargetAttr);
+  if (!mma) {
     return failure();
   }
 
   // Map the matmul TileMxNxK to an actual tile shape for the tensor at hand,
   // based on its operand index in the matmul.
+  // TODO: Support unrolling.
   auto rank = tensorType.getRank();
-
-  auto encodingInfo =
-      getEncodingInfoForMatmul(encoding, rank, enumeratedTileMxNxK[0]);
+  TileMxNxK innerTile;
+  std::tie(innerTile.M, innerTile.N, innerTile.K) = mma->getMNKShape();
+  auto encodingInfo = getEncodingInfoForMatmul(encoding, rank, innerTile);
 
   // insert inner tile shapes and permutation info
   auto roleIdx = encoding.getOperandIndex().getInt();
-  auto intrinsicVectorSizes = getIntrinsicVectorSize(elementTypes, roleIdx);
-  auto permutation = getTransposePermutation(roleIdx, elementTypes);
+  auto intrinsicVectorSizes = getIntrinsicVectorSize(*mma, roleIdx);
+  auto permutation = getTransposePermutation(*mma, roleIdx);
   encodingInfo.innerTileShapes = intrinsicVectorSizes;
   encodingInfo.permutation = permutation;
   return encodingInfo;
@@ -146,6 +160,11 @@ struct GPUMaterializeDeviceEncodingPass final
           GPUMaterializeDeviceEncodingPass> {
   using GPUMaterializeDeviceEncodingPassBase::
       GPUMaterializeDeviceEncodingPassBase;
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<IREE::Encoding::IREEEncodingDialect,
+                    IREE::GPU::IREEGPUDialect>();
+  }
   void runOnOperation() override;
 };
 
@@ -301,13 +320,25 @@ struct GPUSetEncodingOpLoweringConversion
 
 } // namespace
 
+// TODO(hanchung): Remove the wrapper after allowing the type converter to carry
+// the targetAttr. For now, follow what CPU is doing.
+static MaterializeEncodingFn
+getMaterializeEncodingFn(IREE::HAL::ExecutableTargetAttr targetAttr) {
+  return
+      [targetAttr](
+          RankedTensorType tensorType) -> FailureOr<MaterializeEncodingInfo> {
+        return materializeEncodingForTarget(tensorType, targetAttr);
+      };
+}
+
 void GPUMaterializeDeviceEncodingPass::runOnOperation() {
   MLIRContext *ctx = &getContext();
   FunctionOpInterface funcOp = getOperation();
+  auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(funcOp);
   {
     RewritePatternSet patterns(ctx);
     MaterializeEncodingTypeConverter typeConverter(
-        materializeEncodingForTarget);
+        getMaterializeEncodingFn(targetAttr));
     MaterializeEncodingConversionTarget target(*funcOp.getContext());
     MaterializeEncodingValueFn materializeEncodingValueFn =
         [](RankedTensorType, OpBuilder,
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
index 14dbfda1848b..9a2b912d0a4a 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel
@@ -26,6 +26,7 @@ iree_lit_test_suite(
             "gpu_distribute_shared_memory.mlir",
             "gpu_generalize_named_ops.mlir",
             "gpu_lower_to_ukernels.mlir",
+            "gpu_materialize_encoding.mlir",
            "gpu_nested_layout_contract_amdgpu.mlir",
             "gpu_nested_layout_vector_distribution.mlir",
             "gpu_pipeline.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
index 9ccc268d0d79..7d1442abb131 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt
@@ -22,6 +22,7 @@ iree_lit_test_suite(
     "gpu_distribute_shared_memory.mlir"
     "gpu_generalize_named_ops.mlir"
     "gpu_lower_to_ukernels.mlir"
+    "gpu_materialize_encoding.mlir"
     "gpu_nested_layout_contract_amdgpu.mlir"
     "gpu_nested_layout_vector_distribution.mlir"
     "gpu_pipeline.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir
similarity index 84%
rename from compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_materialize_encoding.mlir
rename to compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir
index 7890dffed6c0..1e9e19c7fd41 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_materialize_encoding.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir
@@ -1,16 +1,34 @@
 // RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-materialize-device-encoding))" --split-input-file %s | FileCheck %s
 
+//-----------------------------------------------------------------------------
+// 1. MFMA_F32_16x16x4_F32
+//-----------------------------------------------------------------------------
+
 #encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 16, 16, 16>>
-
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {
+  iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "",
+    wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8,
+    subgroup = shuffle|arithmetic, dot = dp4xi8toi32,
+    mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>],
+    subgroup_size_choices = [64],
+    max_workgroup_sizes = [1024, 1024, 1024],
+    max_thread_count_per_workgroup = 1024,
+    max_workgroup_memory_bytes = 65536>>
+}>
 #pipeline_layout = #hal.pipeline.layout<push_constants = 0, sets = [
   #hal.descriptor_set.layout<0, bindings = [
     #hal.descriptor_set.binding<0, storage_buffer>,
     #hal.descriptor_set.binding<1, storage_buffer>
   ]>
 ]>
-func.func @set_encoding_LHS() {
+func.func @set_encoding_LHS() attributes {
+  hal.executable.target = #executable_target_rocm_hsaco_fb
+} {
   %c0 = arith.constant 0 : index
   %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<255x513xf32>>
   %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<255x513xf32, #encoding>>
@@ -31,9 +49,9 @@
 // CHECK: %[[EXPAND_LHS_2:.*]] = tensor.expand_shape %[[COLLAPSE]]
 // CHECK: flow.dispatch.tensor.store %[[EXPAND_LHS_2]]
 
-//---------
-
-func.func @set_encoding_RHS() {
+func.func @set_encoding_RHS() attributes {
+  hal.executable.target = #executable_target_rocm_hsaco_fb
+} {
   %c0 = arith.constant 0 : index
   %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<255x513xf32>>
   %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<255x513xf32, #encoding>>
@@ -54,9 +72,9 @@
 // CHECK: %[[EXPAND_RHS_2:.*]] = tensor.expand_shape %[[COLLAPSE_RHS]]
 // CHECK: flow.dispatch.tensor.store %[[EXPAND_RHS_2]]
 
-//---------
-
-func.func @set_encoding_ACC() {
+func.func @set_encoding_ACC() attributes {
+  hal.executable.target = #executable_target_rocm_hsaco_fb
+} {
   %c0 = arith.constant 0 : index
   %0 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<255x513xf32>>
   %1 = hal.interface.binding.subspan layout(#pipeline_layout) set(0) binding(1) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<255x513xf32, #encoding>>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 76b085ee4fac..67a84393a918 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -37,7 +37,6 @@ iree_lit_test_suite(
             "config_winograd.mlir",
             "extract_address_computation_gpu.mlir",
             "gpu_set_num_workgroups.mlir",
-            "gpu_materialize_encoding.mlir",
             "gpu_pipeline_generalize_named_ops.mlir",
             "nvvm_extract_address_computation.mlir",
             "nvvm_pipeline_test.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index 89a32173ecb2..46366b4d9fa6 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -33,7 +33,6 @@ iree_lit_test_suite(
     "distribute_to_thread.mlir"
     "elementwise_pipeline.mlir"
     "extract_address_computation_gpu.mlir"
-    "gpu_materialize_encoding.mlir"
    "gpu_pipeline_generalize_named_ops.mlir"
     "gpu_set_num_workgroups.mlir"
"illegal_configuration.mlir" From f7fa4af3a7c8a74e9e8c17f3011487a05beea02c Mon Sep 17 00:00:00 2001 From: hanhanW Date: Thu, 15 Aug 2024 17:16:13 -0700 Subject: [PATCH 2/3] [nfc] pad checks with spaces so the code starts on the same column. Signed-off-by: hanhanW --- .../GPU/test/gpu_materialize_encoding.mlir | 53 +++++++++---------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir index 1e9e19c7fd41..821cac997f9d 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_materialize_encoding.mlir @@ -39,15 +39,15 @@ func.func @set_encoding_LHS() attributes { } // CHECK-LABEL: func.func @set_encoding_LHS -// CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<33x64x16x4xf32> -// CHECK: %[[PACK:.*]] = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %[[EMPTY]] : tensor<255x513xf32> -> tensor<33x64x16x4xf32> -// CHECK: %[[EXPAND_LHS:.*]] = tensor.expand_shape %[[PACK]] -// CHECK-SAME: output_shape [33, 64, 16, 1, 4, 1] : tensor<33x64x16x4xf32> into tensor<33x64x16x1x4x1xf32> -// CHECK: %[[EMPTY_LHS2:.*]] = tensor.empty() : tensor<33x64x4x16x1x1xf32> -// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose ins(%[[EXPAND_LHS]] : tensor<33x64x16x1x4x1xf32>) outs(%[[EMPTY_LHS2]] : tensor<33x64x4x16x1x1xf32>) permutation = [0, 1, 4, 2, 5, 3] -// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[TRANSPOSE]] -// CHECK: %[[EXPAND_LHS_2:.*]] = tensor.expand_shape %[[COLLAPSE]] -// CHECK: flow.dispatch.tensor.store %[[EXPAND_LHS_2]] +// CHECK: %[[EMPTY:.*]] = tensor.empty() : tensor<33x64x16x4xf32> +// CHECK: %[[PACK:.*]] = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %[[EMPTY]] : tensor<255x513xf32> -> tensor<33x64x16x4xf32> +// CHECK: %[[EXPAND_LHS:.*]] = tensor.expand_shape %[[PACK]] +// CHECK-SAME: output_shape [33, 64, 16, 1, 4, 1] : tensor<33x64x16x4xf32> into tensor<33x64x16x1x4x1xf32> +// CHECK: %[[EMPTY_LHS2:.*]] = tensor.empty() : tensor<33x64x4x16x1x1xf32> +// CHECK: %[[TRANSPOSE:.*]] = linalg.transpose ins(%[[EXPAND_LHS]] : tensor<33x64x16x1x4x1xf32>) outs(%[[EMPTY_LHS2]] : tensor<33x64x4x16x1x1xf32>) permutation = [0, 1, 4, 2, 5, 3] +// CHECK: %[[COLLAPSE:.*]] = tensor.collapse_shape %[[TRANSPOSE]] +// CHECK: %[[EXPAND_LHS_2:.*]] = tensor.expand_shape %[[COLLAPSE]] +// CHECK: flow.dispatch.tensor.store %[[EXPAND_LHS_2]] func.func @set_encoding_RHS() attributes { hal.executable.target = #executable_target_rocm_hsaco_fb @@ -62,15 +62,15 @@ func.func @set_encoding_RHS() attributes { } // CHECK-LABEL: func.func @set_encoding_RHS -// CHECK: %[[EMPTY_RHS:.*]] = tensor.empty() : tensor<33x64x16x4xf32> -// CHECK: %[[PACK_RHS:.*]] = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %3 : tensor<255x513xf32> -> tensor<33x64x16x4xf32> -// CHECK: %[[EXPAND_RHS:.*]] = tensor.expand_shape %[[PACK_RHS]] -// CHECK-SAME: output_shape [33, 64, 16, 1, 4, 1] : tensor<33x64x16x4xf32> into tensor<33x64x16x1x4x1xf32> -// CHECK: %[[EMPTY_RHS2:.*]] = tensor.empty() : tensor<33x64x4x16x1x1xf32> -// CHECK: %[[TRANSPOSE_RHS:.*]] = linalg.transpose ins(%[[EXPAND_RHS]] : tensor<33x64x16x1x4x1xf32>) outs(%[[EMPTY_RHS2]] : tensor<33x64x4x16x1x1xf32>) permutation 
= [0, 1, 4, 2, 5, 3] -// CHECK: %[[COLLAPSE_RHS:.*]] = tensor.collapse_shape %[[TRANSPOSE_RHS]] -// CHECK: %[[EXPAND_RHS_2:.*]] = tensor.expand_shape %[[COLLAPSE_RHS]] -// CHECK: flow.dispatch.tensor.store %[[EXPAND_RHS_2]] +// CHECK: %[[EMPTY_RHS:.*]] = tensor.empty() : tensor<33x64x16x4xf32> +// CHECK: %[[PACK_RHS:.*]] = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %3 : tensor<255x513xf32> -> tensor<33x64x16x4xf32> +// CHECK: %[[EXPAND_RHS:.*]] = tensor.expand_shape %[[PACK_RHS]] +// CHECK-SAME: output_shape [33, 64, 16, 1, 4, 1] : tensor<33x64x16x4xf32> into tensor<33x64x16x1x4x1xf32> +// CHECK: %[[EMPTY_RHS2:.*]] = tensor.empty() : tensor<33x64x4x16x1x1xf32> +// CHECK: %[[TRANSPOSE_RHS:.*]] = linalg.transpose ins(%[[EXPAND_RHS]] : tensor<33x64x16x1x4x1xf32>) outs(%[[EMPTY_RHS2]] : tensor<33x64x4x16x1x1xf32>) permutation = [0, 1, 4, 2, 5, 3] +// CHECK: %[[COLLAPSE_RHS:.*]] = tensor.collapse_shape %[[TRANSPOSE_RHS]] +// CHECK: %[[EXPAND_RHS_2:.*]] = tensor.expand_shape %[[COLLAPSE_RHS]] +// CHECK: flow.dispatch.tensor.store %[[EXPAND_RHS_2]] func.func @set_encoding_ACC() attributes { hal.executable.target = #executable_target_rocm_hsaco_fb @@ -85,12 +85,11 @@ func.func @set_encoding_ACC() attributes { } // CHECK-LABEL: func.func @set_encoding_ACC -// CHECK: %[[EMPTY_ACC:.*]] = tensor.empty() : tensor<33x64x16x4xf32> -// CHECK: %[[PACK_ACC:.*]] = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %[[EMPTY_ACC]] : tensor<255x513xf32> -> tensor<33x64x16x4xf32> -// CHECK: %[[EXPAND_ACC:.*]] = tensor.expand_shape %[[PACK_ACC]] -// CHECK: %[[EMPTY_ACC2:.*]] = tensor.empty() : tensor<33x64x4x16x1x1xf32> -// CHECK: %[[TRANSPOSE_ACC:.*]] = linalg.transpose ins(%[[EXPAND_ACC]] : tensor<33x64x16x1x4x1xf32>) outs(%[[EMPTY_ACC2]] : tensor<33x64x4x16x1x1xf32>) permutation = [0, 1, 4, 2, 5, 3] - -// CHECK: %[[COLLAPSE_RHS:.*]] = tensor.collapse_shape %[[TRANSPOSE_ACC]] -// CHECK: %[[EXPAND_ACC_2:.*]] = tensor.expand_shape %[[COLLAPSE_RHS]] -// CHECK: flow.dispatch.tensor.store %[[EXPAND_ACC_2]] +// CHECK: %[[EMPTY_ACC:.*]] = tensor.empty() : tensor<33x64x16x4xf32> +// CHECK: %[[PACK_ACC:.*]] = tensor.pack %2 padding_value(%cst : f32) outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 4] into %[[EMPTY_ACC]] : tensor<255x513xf32> -> tensor<33x64x16x4xf32> +// CHECK: %[[EXPAND_ACC:.*]] = tensor.expand_shape %[[PACK_ACC]] +// CHECK: %[[EMPTY_ACC2:.*]] = tensor.empty() : tensor<33x64x4x16x1x1xf32> +// CHECK: %[[TRANSPOSE_ACC:.*]] = linalg.transpose ins(%[[EXPAND_ACC]] : tensor<33x64x16x1x4x1xf32>) outs(%[[EMPTY_ACC2]] : tensor<33x64x4x16x1x1xf32>) permutation = [0, 1, 4, 2, 5, 3] +// CHECK: %[[COLLAPSE_RHS:.*]] = tensor.collapse_shape %[[TRANSPOSE_ACC]] +// CHECK: %[[EXPAND_ACC_2:.*]] = tensor.expand_shape %[[COLLAPSE_RHS]] +// CHECK: flow.dispatch.tensor.store %[[EXPAND_ACC_2]] From e27d7835aacebd32d2f8f7c1e80c786f0c3ff9fa Mon Sep 17 00:00:00 2001 From: hanhanW Date: Thu, 15 Aug 2024 17:40:59 -0700 Subject: [PATCH 3/3] fix bazel deps Signed-off-by: hanhanW --- compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel | 1 + compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel index bfc0ee207569..65e9b6e7ccc0 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel 
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel @@ -96,6 +96,7 @@ iree_compiler_cc_library( "//compiler/src/iree/compiler/Codegen/Transforms", "//compiler/src/iree/compiler/Codegen/Utils", "//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils", + "//compiler/src/iree/compiler/Dialect/Encoding/IR", "//compiler/src/iree/compiler/Dialect/HAL/IR", "@llvm-project//llvm:Support", "@llvm-project//mlir:AMDGPUDialect", diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt index c3b265e6f804..3c29b055541c 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt @@ -126,6 +126,7 @@ iree_cc_library( iree::compiler::Codegen::Transforms iree::compiler::Codegen::Utils iree::compiler::Codegen::Utils::VectorOpUtils + iree::compiler::Dialect::Encoding::IR iree::compiler::Dialect::HAL::IR PUBLIC )
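
Not part of the patches above: a condensed sketch of the selection logic that
PATCH 1 introduces, for readers skimming the series. It assumes the
MMAAttr/TargetAttr API already used in the diff (getWgp().getMma(),
getABCElementTypes(), getMNKShape()); `chooseInnerTile` is a hypothetical
helper name, not a function in the patch.

// Hypothetical sketch: pick the first MMA intrinsic advertised by the target
// whose A/B/C element types match the encoding's {LHS, RHS, OUT} types, and
// use its MxNxK shape as the data-tiling inner tile.
static std::optional<TileMxNxK>
chooseInnerTile(TypeRange elementTypes, IREE::GPU::TargetAttr target) {
  assert(elementTypes.size() == 3);
  for (IREE::GPU::MMAAttr mma : target.getWgp().getMma()) {
    auto [aType, bType, cType] = mma.getABCElementTypes();
    if (elementTypes[0] != aType || elementTypes[1] != bType ||
        elementTypes[2] != cType) {
      continue; // This intrinsic computes on different element types.
    }
    TileMxNxK tile;
    std::tie(tile.M, tile.N, tile.K) = mma.getMNKShape();
    return tile; // e.g. {16, 16, 4} for MFMA_F32_16x16x4_F32.
  }
  return std::nullopt; // No matching intrinsic; the caller returns failure().
}

Because the lookup goes through IREE::GPU::TargetAttr, enabling data-tiling for
a new intrinsic mostly means extending the target description (plus supplying
the role-specific vector sizes and transpose permutations), rather than adding
another hard-coded element-type check to this pass.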