Create an e2e DoubleTilingExpert pass pipeline for CPU gemms. (#8118)
- Add a DoubleTilingExpert pipeline which uses sandbox codegen driver
  approaches.
- Add a struct that mirrors all the options specified for the
  LinalgVectorLowering pass in codegen. This allows the pass pipeline
  to control these options.
- This enables vectorization for cases where the dim sizes are not
  multiples of the tile sizes.
- Verified that the final LLVM IRs are almost identical for some matmul
  cases, but there are still performance gaps. Thus, the option is not
  on by default. We might have to re-pick the L1 tile sizes because they
  are larger than the workgroup sizes in IREE, so the L1 tiling loop is
  always a one-trip loop.
hanhanW authored Jan 15, 2022
1 parent 1448346 commit ff1149c
Showing 12 changed files with 233 additions and 23 deletions.
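
The new pipeline is gated behind a development-only flag. As a usage sketch, a
minimal invocation mirroring the RUN line of the new lit test below (the input
file name here is hypothetical) is:

  iree-opt --iree-codegen-use-double-tiling-expert \
      -pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target))' \
      matmul_dispatch.mlir
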
12 changes: 7 additions & 5 deletions iree/compiler/Codegen/Dialect/LoweringConfig.td
@@ -14,6 +14,8 @@ def CPU_Default
: StrEnumAttrCase<"CPUDefault">;
def CPU_SingleTilingExpert
: StrEnumAttrCase<"CPUSingleTilingExpert">;
def CPU_DoubleTilingExpert
: StrEnumAttrCase<"CPUDoubleTilingExpert">;
def CPU_TensorToVectors
: StrEnumAttrCase<"CPUTensorToVectors">;
def CPU_TileFuseAndVectorize
@@ -45,11 +47,11 @@ def None
def DispatchLoweringPassPipelineEnum : StrEnumAttr<
"DispatchLoweringPassPipeline",
"identifier for pass pipeline use to lower dispatch region",
[CPU_Default, CPU_SingleTilingExpert, CPU_TensorToVectors,
CPU_TileFuseAndVectorize, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore, SPIRV_Distribute,
SPIRV_DistributeCopy, SPIRV_Vectorize, SPIRV_VectorizeToCooperativeOps,
None]> {
[CPU_Default, CPU_SingleTilingExpert, CPU_DoubleTilingExpert,
CPU_TensorToVectors, CPU_TileFuseAndVectorize, LLVMGPU_SimpleDistribute,
LLVMGPU_Vectorize, LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore,
SPIRV_Distribute, SPIRV_DistributeCopy, SPIRV_Vectorize,
SPIRV_VectorizeToCooperativeOps, None]> {
let cppNamespace = "::mlir::iree_compiler::IREE::Codegen";
}

54 changes: 52 additions & 2 deletions iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -63,6 +63,13 @@ static llvm::cl::opt<int> defaultWorkgroupTileSize(
"linalg.generic and linalg.indexed_generic workgroup tile size"),
llvm::cl::init(64));

// TODO(hanchung): Enable the flag by default after addressing perf
// regressions.
static llvm::cl::opt<bool> useDoubleTilingExpert(
"iree-codegen-use-double-tiling-expert",
llvm::cl::desc("DEVELOPMENT ONLY, DO NOT USE THE FLAG."),
llvm::cl::init(false));

using IREE::Codegen::DispatchLoweringPassPipeline;

static bool isVMVX(FuncOp entryPointFn) {
@@ -313,6 +320,39 @@ static LogicalResult setX86RootConfig(FuncOp entryPointFn,
return success();
}

static LogicalResult setX86SandboxRootConfig(
FuncOp entryPointFn, linalg::ContractionOpInterface op,
SmallVector<int64_t> workloadPerWorkgroup, int vectorSize) {
setTranslationInfo(entryPointFn,
DispatchLoweringPassPipeline::CPUDoubleTilingExpert,
workloadPerWorkgroup,
/*workgroupSize=*/ArrayRef<int64_t>{});

// Hardcoded tile sizes. The configuration is derived from iree-llvm-sandbox.
// L1 tile sizes are {1, 1, ..., 288, 128, 512}.
// Vector tile sizes are {1, ..., 9, 32, 16}.
SmallVector<int64_t> l1TileSizes, vectorTileSizes;
int64_t nLoops = cast<linalg::LinalgOp>(op.getOperation()).getNumLoops();
l1TileSizes.append(nLoops - 3, 1);
l1TileSizes.push_back(288);
l1TileSizes.push_back(128);
l1TileSizes.push_back(512);
vectorTileSizes.append(nLoops - 3, 1);
vectorTileSizes.push_back(9);
vectorTileSizes.push_back(32);
vectorTileSizes.push_back(16);

TileSizesListType tileSizes;
tileSizes.push_back({});
tileSizes.push_back(l1TileSizes);
tileSizes.push_back(vectorTileSizes);
auto config = IREE::Codegen::LoweringConfigAttr::get(
entryPointFn.getContext(), tileSizes, vectorTileSizes);
setLoweringConfig(op, config);

return success();
}
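
For a plain (non-batch) matmul, nLoops is 3, so the code above produces
l1TileSizes = {288, 128, 512} and vectorTileSizes = {9, 32, 16}. The attached
lowering configuration then materializes as the attribute checked by the new
lit test below:

  #iree_codegen.lowering.config<tile_sizes = [[], [288, 128, 512], [9, 32, 16]], native_vector_size = [9, 32, 16]>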

static LogicalResult setARMRootConfig(FuncOp entryPointFn,
linalg::ContractionOpInterface op,
SmallVector<int64_t> workloadPerWorkgroup,
@@ -389,8 +429,18 @@ static LogicalResult setRootConfig(

Optional<llvm::Triple> triple = getTargetTriple(entryPointFn);
if (triple && triple.getValue().isX86()) {
return setX86RootConfig(entryPointFn, contractionOp, workloadPerWorkgroup,
vectorSize);
// For DoubleTilingExpert, we will use LinalgSingleTilingExpertPassOptions
// to control transforms. There is a tileInterchange option that needs to be
// configured, but we don't know the number of loops when adding the pass to
// the pass manager. Thus, we don't use the double tiling expert for batch
// gemms for now.
if (!numBatchDims && useDoubleTilingExpert) {
return setX86SandboxRootConfig(entryPointFn, contractionOp,
workloadPerWorkgroup, vectorSize);
} else {
return setX86RootConfig(entryPointFn, contractionOp, workloadPerWorkgroup,
vectorSize);
}
}
// Fall back to ARM configurations.
return setARMRootConfig(entryPointFn, contractionOp, workloadPerWorkgroup,
6 changes: 6 additions & 0 deletions iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -181,6 +181,12 @@ void LLVMCPULowerExecutableTargetPass::runOnOperation() {
createConvertToDestinationPassingStylePass());
addSingleTilingExpertPassPipeline(nestedModulePM);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUDoubleTilingExpert:
nestedModulePM.addNestedPass<FuncOp>(
createConvertToDestinationPassingStylePass());
addDoubleTilingExpertPassPipeline(nestedModulePM);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors:
addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors);
break;
68 changes: 61 additions & 7 deletions iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -180,11 +180,13 @@ void addTensorToVectorsPassPipeline(OpPassManager &passManager,
void addSingleTilingExpertPassPipeline(OpPassManager &passManager) {
passManager.addPass(createCanonicalizerPass());
// Add the sandbox single tiling expert to tile and vectorize.
LinalgSingleTilingExpertPassOptions options;
options.vectorize = true;
options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
{
LinalgSingleTilingExpertPassOptions options;
options.vectorize = true;
options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
}

// TODO(ravishankarm): This is commented out because it is WIP, to be
// enabled soon.
@@ -197,8 +199,60 @@ void addSingleTilingExpertPassPipeline(OpPassManager &passManager) {
addLinalgBufferizePasses(passManager, cpuAllocationFunction);

// Add the vector lowering expert.
OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
addLowerToVectorTransforms(nestedFuncPassManager);
{
OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
LinalgVectorLoweringPassOptions options;
addLowerToVectorTransforms(nestedFuncPassManager, options);
}
}

void addDoubleTilingExpertPassPipeline(OpPassManager &passManager) {
passManager.addPass(createCanonicalizerPass());
{
passManager.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());
LinalgSingleTilingExpertPassOptions options;
options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
options.tileInterchange = {0, 2, 1};
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<FuncOp>(createCSEPass());
}

// Add the sandbox single tiling expert to tile and vectorize.
{
// The options are derived from the sandbox codegen driver. The hoistPaddings
// option does not work in IREE cases. It's fine to not have it, since the
// generated IR is already the same as the sandbox's.
LinalgSingleTilingExpertPassOptions options;
options.vectorize = true;
options.vectorizePadding = true;
options.pad = true;
options.packPaddings = {1, 1, 0};
// options.hoistPaddings = {5, 6, 0};
options.tilingLevel = static_cast<int64_t>(TilingLevel::VectorTiles);
options.tileInterchange = {0, 1, 2};
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
}

// TODO(ravishankarm): This is commented out because it is WIP, to be
// enabled soon.
// auto callbacks =
// std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>(
// cpuComprehensiveBufferizeAllocationFn,
// cpuComprehensiveBufferizeDeallocationFn,
// cpuComprehensiveBufferizeCopyFn);
// addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks));
addLinalgBufferizePasses(passManager, cpuAllocationFunction);

// Add the vector lowering expert.
{
OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
LinalgVectorLoweringPassOptions options;
options.splitVectorTransfersTo = "linalg-copy";
addLowerToVectorTransforms(nestedFuncPassManager, options);
}
}
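
In summary, the double tiling expert pipeline above runs: (1) canonicalization
plus L1-level tiling with tile interchange {0, 2, 1}, followed by
canonicalize/CSE; (2) vector-level tiling with interchange {0, 1, 2}, padding
(packPaddings = {1, 1, 0}), and vectorization; (3) Linalg bufferization; and
(4) the staged vector lowering expert with vector transfers split to
linalg-copy.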

void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager,
Expand Down
1 change: 1 addition & 0 deletions iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -24,6 +24,7 @@ iree_lit_test_suite(
"hal_interface_constants.mlir",
"hal_interface_workgroup_info.mlir",
"illegal_configuration.mlir",
"materialize_double_tiling_expert_configuration.mlir",
"materialize_launch_configuration.mlir",
"synchronize_symbol_visibility.mlir",
"test_config_mmt4d.mlir",
1 change: 1 addition & 0 deletions iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -19,6 +19,7 @@ iree_lit_test_suite(
"hal_interface_constants.mlir"
"hal_interface_workgroup_info.mlir"
"illegal_configuration.mlir"
"materialize_double_tiling_expert_configuration.mlir"
"materialize_launch_configuration.mlir"
"synchronize_symbol_visibility.mlir"
"test_config_mmt4d.mlir"
60 changes: 60 additions & 0 deletions iree/compiler/Codegen/LLVMCPU/test/materialize_double_tiling_expert_configuration.mlir
@@ -0,0 +1,60 @@
// RUN: iree-opt --iree-codegen-use-double-tiling-expert -pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{test-lowering-configuration=true}))' -cse -canonicalize -split-input-file %s | FileCheck %s

#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,
#hal.descriptor_set.binding<1, storage_buffer>,
#hal.descriptor_set.binding<2, storage_buffer>
]>
]>
hal.executable private @matmul_x86 {
hal.executable.variant public @embedded_elf_x86_64, target = #hal.executable.target<
"llvm",
"embedded-elf-x86_64", {
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
native_vector_size = 16 : index,
target_triple = "x86_64-unknown-unknown-eabi-elf"
}> {
hal.executable.entry_point public @matmul_x86 layout(#executable_layout)
builtin.module {
func @matmul_x86() {
%c128 = arith.constant 128 : index
%c384 = arith.constant 384 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:384x512xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:512x128xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:384x128xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c384 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c128 step %6 {
%7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:384x512xf32> -> tensor<?x512xf32>
%9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 128)>(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [512, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x?xf32>
%11 = affine.min affine_map<(d0)[s0] -> (-d0 + 384, s0)>(%arg0)[%workgroup_size_y]
%12 = affine.min affine_map<(d0)[s0] -> (-d0 + 128, s0)>(%arg1)[%workgroup_size_x]
%13 = linalg.init_tensor [%11, %12] : tensor<?x?xf32>
%14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%15 = linalg.matmul ins(%8, %10 : tensor<?x512xf32>, tensor<512x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:384x128xf32>
}
}
return
}
}
}
}

// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUDoubleTilingExpert", workload_per_wg = [64, 64]>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = [{{\[}}], [288, 128, 512], [9, 32, 16]], native_vector_size = [9, 32, 16]>
// CHECK: linalg.matmul {lowering.config = #[[CONFIG]]}
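
Note that the RUN line sets test-lowering-configuration=true, so this test
only checks that the translation info and lowering configuration above are
materialized; it does not exercise the rest of the lowering pipeline.
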
4 changes: 4 additions & 0 deletions iree/compiler/Codegen/Passes.h
@@ -209,6 +209,10 @@ void addTensorToVectorsPassPipeline(OpPassManager &passManager,
/// using the Codegen drivers from sandbox.
void addSingleTilingExpertPassPipeline(OpPassManager &passManager);

/// Populates the passes needed to do two-level tile + vectorize of linalg ops
/// using the Codegen drivers from sandbox.
void addDoubleTilingExpertPassPipeline(OpPassManager &passManager);

/// Populates the passes needed to multi level tile, fuse and vectorize lowering
/// of linalg ops on tensors to vectors operations.
void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager,
1 change: 1 addition & 0 deletions iree/compiler/Codegen/Sandbox/BUILD
@@ -62,6 +62,7 @@ cc_library(
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:SCFDialect",
"@llvm-project//mlir:TensorDialect",
"@llvm-project//mlir:Transforms",
"@llvm-project//mlir:VectorOps",
"@llvm-project//mlir:X86VectorTransforms",
],
1 change: 1 addition & 0 deletions iree/compiler/Codegen/Sandbox/CMakeLists.txt
@@ -53,6 +53,7 @@ iree_cc_library(
MLIRPass
MLIRSCF
MLIRTensor
MLIRTransforms
MLIRVector
MLIRX86VectorTransforms
iree::compiler::Codegen::Dialect::IREECodegenDialect
31 changes: 23 additions & 8 deletions iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
@@ -19,6 +19,7 @@
#include "mlir/Dialect/Vector/VectorTransforms.h"
#include "mlir/Dialect/X86Vector/Transforms.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;
using namespace mlir::linalg;
@@ -121,6 +122,16 @@ struct LinalgVectorLoweringPass
this->vectorLoweringStage.setValue(vectorLoweringStage);
}
LinalgVectorLoweringPass(const LinalgVectorLoweringPass &pass) {}
LinalgVectorLoweringPass(const LinalgVectorLoweringPassOptions &options) {
this->vectorLoweringStage = options.vectorLoweringStage;
this->splitVectorTransfersTo = options.splitVectorTransfersTo;
this->lowerVectorTransposeTo = options.lowerVectorTransposeTo;
this->lowerVectorTransposeToAVX2 = options.lowerVectorTransposeToAVX2;
this->lowerVectorMultiReductionTo = options.lowerVectorMultiReductionTo;
this->lowerVectorContractionTo = options.lowerVectorContractionTo;
this->unrollVectorTransfers = options.unrollVectorTransfers;
this->maxTransferRank = options.maxTransferRank;
}

void runOnOperation() override;
};
@@ -346,19 +357,23 @@ std::unique_ptr<OperationPass<FuncOp>> mlir::createLinalgVectorLoweringPass(
int64_t vectorLoweringStage) {
return std::make_unique<LinalgVectorLoweringPass>(vectorLoweringStage);
}
std::unique_ptr<OperationPass<FuncOp>> mlir::createLinalgVectorLoweringPass(
const LinalgVectorLoweringPassOptions &options) {
return std::make_unique<LinalgVectorLoweringPass>(options);
}

//===----------------------------------------------------------------------===//
// Transforms
//===----------------------------------------------------------------------===//

void mlir::addLowerToVectorTransforms(OpPassManager &passManager) {
passManager.addPass(createLinalgVectorLoweringPass(0));
passManager.addPass(createLinalgVectorLoweringPass(1));
passManager.addPass(createLinalgVectorLoweringPass(2));
passManager.addPass(createLinalgVectorLoweringPass(3));
passManager.addPass(createLinalgVectorLoweringPass(4));
passManager.addPass(createLinalgVectorLoweringPass(5));
passManager.addPass(createLinalgVectorLoweringPass(6));
void mlir::addLowerToVectorTransforms(OpPassManager &passManager,
LinalgVectorLoweringPassOptions options) {
for (int i = 0; i < 7; ++i) {
options.vectorLoweringStage = i;
passManager.addPass(createLinalgVectorLoweringPass(options));
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
}
}
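
Each of the seven vector lowering stages (0 through 6) now runs with the
caller-provided options (only vectorLoweringStage is overwritten per
iteration), with canonicalization and CSE interleaved. A caller overrides just
the knobs it needs and keeps the struct's defaults for the rest, as the double
tiling expert pipeline above does:

  OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
  LinalgVectorLoweringPassOptions options;
  options.splitVectorTransfersTo = "linalg-copy";
  addLowerToVectorTransforms(nestedFuncPassManager, options);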

//===----------------------------------------------------------------------===//
17 changes: 16 additions & 1 deletion iree/compiler/Codegen/Sandbox/Passes.h
@@ -38,18 +38,33 @@ std::unique_ptr<OperationPass<FuncOp>> createLinalgSingleTilingExpertPass();
std::unique_ptr<OperationPass<FuncOp>> createLinalgSingleTilingExpertPass(
const LinalgSingleTilingExpertPassOptions &passOptions);

/// Struct to control pass options for `LinalgVectorLoweringPass` pass.
struct LinalgVectorLoweringPassOptions {
int vectorLoweringStage = 0;
std::string splitVectorTransfersTo = "";
std::string lowerVectorTransposeTo = "eltwise";
bool lowerVectorTransposeToAVX2 = false;
std::string lowerVectorMultiReductionTo = "innerparallel";
std::string lowerVectorContractionTo = "outerproduct";
bool unrollVectorTransfers = true;
int maxTransferRank = 1;
};

/// Creates a pass to drive the lowering of vector operations in a staged
/// manner.
std::unique_ptr<OperationPass<FuncOp>> createLinalgVectorLoweringPass(
int64_t vectorLoweringStage = 0);
std::unique_ptr<OperationPass<FuncOp>> createLinalgVectorLoweringPass(
const LinalgVectorLoweringPassOptions &options);

//===----------------------------------------------------------------------===//
// Transforms that tie together individual drivers.
//===----------------------------------------------------------------------===//

/// Add staged lowering of vector ops. `passManager` is expected to be a
/// `builtin.func` op pass manager.
void addLowerToVectorTransforms(OpPassManager &passManager);
void addLowerToVectorTransforms(OpPassManager &passManager,
LinalgVectorLoweringPassOptions options);

//===----------------------------------------------------------------------===//
// IREE specific pass creation methods to allow invocation from within IREEs
