Create an e2e DoubleTilingExpert pass pipeline for CPU gemms. (#8118)
- Add a DoubleTilingExpert pipeline which uses sandbox codegen driver
  approaches.
- Add a struct that mirrors all the options specified for the
  LinalgVectorLowering pass in codegen. This allows the pass pipeline
  to control these options.
- This enables vectorization for cases where the dim sizes are not
  multiples of the tile sizes.
- Verified that the final LLVM IRs are almost identical for some matmul
  cases, but there are still performance gaps. Thus, the option is not
  on by default. We might have to re-pick the L1 tile sizes because they
  are larger than the workgroup sizes in IREE, so the L1 tiling loop is
  always a one-trip loop.
hanhanW authored Jan 15, 2022
1 parent 1448346 commit ff1149c
Showing 12 changed files with 233 additions and 23 deletions.
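
The new pipeline is gated behind a development-only flag. As a usage sketch, a
minimal invocation mirroring the RUN line of the new lit test below (the input
file name here is hypothetical) is:

  iree-opt --iree-codegen-use-double-tiling-expert \
      -pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target))' \
      matmul_dispatch.mlir
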
12 changes: 7 additions & 5 deletions iree/compiler/Codegen/Dialect/LoweringConfig.td
@@ -14,6 +14,8 @@ def CPU_Default
: StrEnumAttrCase<"CPUDefault">;
def CPU_SingleTilingExpert
: StrEnumAttrCase<"CPUSingleTilingExpert">;
def CPU_DoubleTilingExpert
: StrEnumAttrCase<"CPUDoubleTilingExpert">;
def CPU_TensorToVectors
: StrEnumAttrCase<"CPUTensorToVectors">;
def CPU_TileFuseAndVectorize
@@ -45,11 +47,11 @@ def None
def DispatchLoweringPassPipelineEnum : StrEnumAttr<
"DispatchLoweringPassPipeline",
"identifier for pass pipeline use to lower dispatch region",
[CPU_Default, CPU_SingleTilingExpert, CPU_TensorToVectors,
CPU_TileFuseAndVectorize, LLVMGPU_SimpleDistribute, LLVMGPU_Vectorize,
LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore, SPIRV_Distribute,
SPIRV_DistributeCopy, SPIRV_Vectorize, SPIRV_VectorizeToCooperativeOps,
None]> {
[CPU_Default, CPU_SingleTilingExpert, CPU_DoubleTilingExpert,
CPU_TensorToVectors, CPU_TileFuseAndVectorize, LLVMGPU_SimpleDistribute,
LLVMGPU_Vectorize, LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore,
SPIRV_Distribute, SPIRV_DistributeCopy, SPIRV_Vectorize,
SPIRV_VectorizeToCooperativeOps, None]> {
let cppNamespace = "::mlir::iree_compiler::IREE::Codegen";
}

54 changes: 52 additions & 2 deletions iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
@@ -63,6 +63,13 @@ static llvm::cl::opt<int> defaultWorkgroupTileSize(
"linalg.generic and linalg.indexed_generic workgroup tile size"),
llvm::cl::init(64));

// TODO(hanchung): Enable the flag by default after addressing perf
// regressions.
static llvm::cl::opt<bool> useDoubleTilingExpert(
"iree-codegen-use-double-tiling-expert",
llvm::cl::desc("DEVELOPMENT ONLY, DO NOT USE THE FLAG."),
llvm::cl::init(false));

using IREE::Codegen::DispatchLoweringPassPipeline;

static bool isVMVX(FuncOp entryPointFn) {
@@ -313,6 +320,39 @@ static LogicalResult setX86RootConfig(FuncOp entryPointFn,
return success();
}

static LogicalResult setX86SandboxRootConfig(
FuncOp entryPointFn, linalg::ContractionOpInterface op,
SmallVector<int64_t> workloadPerWorkgroup, int vectorSize) {
setTranslationInfo(entryPointFn,
DispatchLoweringPassPipeline::CPUDoubleTilingExpert,
workloadPerWorkgroup,
/*workgroupSize=*/ArrayRef<int64_t>{});

// Hardcoded tile sizes. The configuration is derived from iree-llvm-sandbox.
// L1 tile sizes are {1, 1, ..., 288, 128, 512}.
// Vector tile sizes are {1, ..., 9, 32, 16}.
SmallVector<int64_t> l1TileSizes, vectorTileSizes;
int64_t nLoops = cast<linalg::LinalgOp>(op.getOperation()).getNumLoops();
l1TileSizes.append(nLoops - 3, 1);
l1TileSizes.push_back(288);
l1TileSizes.push_back(128);
l1TileSizes.push_back(512);
vectorTileSizes.append(nLoops - 3, 1);
vectorTileSizes.push_back(9);
vectorTileSizes.push_back(32);
vectorTileSizes.push_back(16);

TileSizesListType tileSizes;
tileSizes.push_back({});
tileSizes.push_back(l1TileSizes);
tileSizes.push_back(vectorTileSizes);
auto config = IREE::Codegen::LoweringConfigAttr::get(
entryPointFn.getContext(), tileSizes, vectorTileSizes);
setLoweringConfig(op, config);

return success();
}
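
For a plain (non-batch) matmul, nLoops is 3, so the code above produces
l1TileSizes = {288, 128, 512} and vectorTileSizes = {9, 32, 16}. The attached
lowering configuration then materializes as the attribute checked by the new
lit test below:

  #iree_codegen.lowering.config<tile_sizes = [[], [288, 128, 512], [9, 32, 16]], native_vector_size = [9, 32, 16]>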

static LogicalResult setARMRootConfig(FuncOp entryPointFn,
linalg::ContractionOpInterface op,
SmallVector<int64_t> workloadPerWorkgroup,
@@ -389,8 +429,18 @@ static LogicalResult setRootConfig(

Optional<llvm::Triple> triple = getTargetTriple(entryPointFn);
if (triple && triple.getValue().isX86()) {
return setX86RootConfig(entryPointFn, contractionOp, workloadPerWorkgroup,
vectorSize);
// For DoubleTilingExpert, we will use LinalgSingleTilingExpertPassOptions
// to control transforms. There is a tileInterchange option that needs to be
// configured, but we don't know the number of loops when adding the pass to
// the pass manager. Thus, we don't use the double tiling expert for batch
// gemms for now.
if (!numBatchDims && useDoubleTilingExpert) {
return setX86SandboxRootConfig(entryPointFn, contractionOp,
workloadPerWorkgroup, vectorSize);
} else {
return setX86RootConfig(entryPointFn, contractionOp, workloadPerWorkgroup,
vectorSize);
}
}
// Fall back to ARM configurations.
return setARMRootConfig(entryPointFn, contractionOp, workloadPerWorkgroup,
6 changes: 6 additions & 0 deletions iree/compiler/Codegen/LLVMCPU/LLVMCPULowerExecutableTarget.cpp
@@ -181,6 +181,12 @@ void LLVMCPULowerExecutableTargetPass::runOnOperation() {
createConvertToDestinationPassingStylePass());
addSingleTilingExpertPassPipeline(nestedModulePM);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::
CPUDoubleTilingExpert:
nestedModulePM.addNestedPass<FuncOp>(
createConvertToDestinationPassingStylePass());
addDoubleTilingExpertPassPipeline(nestedModulePM);
break;
case IREE::Codegen::DispatchLoweringPassPipeline::CPUTensorToVectors:
addTensorToVectorsPassPipeline(nestedModulePM, lowerToVectors);
break;
68 changes: 61 additions & 7 deletions iree/compiler/Codegen/LLVMCPU/Passes.cpp
@@ -180,11 +180,13 @@ void addTensorToVectorsPassPipeline(OpPassManager &passManager,
void addSingleTilingExpertPassPipeline(OpPassManager &passManager) {
passManager.addPass(createCanonicalizerPass());
// Add the sandbox single tiling expert to tile and vectorize.
LinalgSingleTilingExpertPassOptions options;
options.vectorize = true;
options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
{
LinalgSingleTilingExpertPassOptions options;
options.vectorize = true;
options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
}

// TODO(ravishankarm): This is commented out because it is WIP, to be
// enabled soon.
@@ -197,8 +199,60 @@ void addSingleTilingExpertPassPipeline(OpPassManager &passManager) {
addLinalgBufferizePasses(passManager, cpuAllocationFunction);

// Add the vector lowering expert.
OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
addLowerToVectorTransforms(nestedFuncPassManager);
{
OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
LinalgVectorLoweringPassOptions options;
addLowerToVectorTransforms(nestedFuncPassManager, options);
}
}

void addDoubleTilingExpertPassPipeline(OpPassManager &passManager) {
passManager.addPass(createCanonicalizerPass());
{
passManager.addNestedPass<FuncOp>(createRemoveSingleIterationLoopPass());
LinalgSingleTilingExpertPassOptions options;
options.tilingLevel = static_cast<int64_t>(TilingLevel::L1Tiles);
options.tileInterchange = {0, 2, 1};
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
passManager.addNestedPass<FuncOp>(createCanonicalizerPass());
passManager.addNestedPass<FuncOp>(createCSEPass());
}

// Add the sandbox single tiling expert to tile and vectorize.
{
// The options are derived from the sandbox codegen driver. The hoistPaddings
// option does not work in IREE cases. It's fine to not have it, since the
// generated IR is already the same as the sandbox's.
LinalgSingleTilingExpertPassOptions options;
options.vectorize = true;
options.vectorizePadding = true;
options.pad = true;
options.packPaddings = {1, 1, 0};
// options.hoistPaddings = {5, 6, 0};
options.tilingLevel = static_cast<int64_t>(TilingLevel::VectorTiles);
options.tileInterchange = {0, 1, 2};
passManager.addNestedPass<FuncOp>(
createLinalgSingleTilingExpertPass(options));
}

// TODO(ravishankarm): This is commented out because it is WIP, to be
// enabled soon.
// auto callbacks =
// std::make_unique<linalg::comprehensive_bufferize::AllocationCallbacks>(
// cpuComprehensiveBufferizeAllocationFn,
// cpuComprehensiveBufferizeDeallocationFn,
// cpuComprehensiveBufferizeCopyFn);
// addIREEComprehensiveBufferizePasses(passManager, std::move(callbacks));
addLinalgBufferizePasses(passManager, cpuAllocationFunction);

// Add the vector lowering expert.
{
OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
LinalgVectorLoweringPassOptions options;
options.splitVectorTransfersTo = "linalg-copy";
addLowerToVectorTransforms(nestedFuncPassManager, options);
}
}
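
In summary, the double tiling expert pipeline above runs: (1) canonicalization
plus L1-level tiling with tile interchange {0, 2, 1}, followed by
canonicalize/CSE; (2) vector-level tiling with interchange {0, 1, 2}, padding
(packPaddings = {1, 1, 0}), and vectorization; (3) Linalg bufferization; and
(4) the staged vector lowering expert with vector transfers split to
linalg-copy.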

void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager,
Expand Down
1 change: 1 addition & 0 deletions iree/compiler/Codegen/LLVMCPU/test/BUILD
@@ -24,6 +24,7 @@ iree_lit_test_suite(
"hal_interface_constants.mlir",
"hal_interface_workgroup_info.mlir",
"illegal_configuration.mlir",
"materialize_double_tiling_expert_configuration.mlir",
"materialize_launch_configuration.mlir",
"synchronize_symbol_visibility.mlir",
"test_config_mmt4d.mlir",
1 change: 1 addition & 0 deletions iree/compiler/Codegen/LLVMCPU/test/CMakeLists.txt
@@ -19,6 +19,7 @@ iree_lit_test_suite(
"hal_interface_constants.mlir"
"hal_interface_workgroup_info.mlir"
"illegal_configuration.mlir"
"materialize_double_tiling_expert_configuration.mlir"
"materialize_launch_configuration.mlir"
"synchronize_symbol_visibility.mlir"
"test_config_mmt4d.mlir"
60 changes: 60 additions & 0 deletions iree/compiler/Codegen/LLVMCPU/test/materialize_double_tiling_expert_configuration.mlir
@@ -0,0 +1,60 @@
// RUN: iree-opt --iree-codegen-use-double-tiling-expert -pass-pipeline='hal.executable(hal.executable.variant(iree-llvmcpu-lower-executable-target{test-lowering-configuration=true}))' -cse -canonicalize -split-input-file %s | FileCheck %s

#executable_layout = #hal.executable.layout<push_constants = 0, sets = [
#hal.descriptor_set.layout<0, bindings = [
#hal.descriptor_set.binding<0, storage_buffer>,
#hal.descriptor_set.binding<1, storage_buffer>,
#hal.descriptor_set.binding<2, storage_buffer>
]>
]>
hal.executable private @matmul_x86 {
hal.executable.variant public @embedded_elf_x86_64, target = #hal.executable.target<
"llvm",
"embedded-elf-x86_64", {
data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128",
native_vector_size = 16 : index,
target_triple = "x86_64-unknown-unknown-eabi-elf"
}> {
hal.executable.entry_point public @matmul_x86 layout(#executable_layout)
builtin.module {
func @matmul_x86() {
%c128 = arith.constant 128 : index
%c384 = arith.constant 384 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) : !flow.dispatch.tensor<readonly:384x512xf32>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) : !flow.dispatch.tensor<readonly:512x128xf32>
%2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) : !flow.dispatch.tensor<writeonly:384x128xf32>
%workgroup_size_x = hal.interface.workgroup.size[0] : index
%workgroup_size_y = hal.interface.workgroup.size[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%3 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_y, %workgroup_size_y]
%4 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_y, %workgroup_size_y]
scf.for %arg0 = %3 to %c384 step %4 {
%5 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_id_x, %workgroup_size_x]
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * s1)>()[%workgroup_count_x, %workgroup_size_x]
scf.for %arg1 = %5 to %c128 step %6 {
%7 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 384)>(%arg0)[%workgroup_size_y]
%8 = flow.dispatch.tensor.load %0, offsets = [%arg0, 0], sizes = [%7, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:384x512xf32> -> tensor<?x512xf32>
%9 = affine.min affine_map<(d0)[s0] -> (s0, -d0 + 128)>(%arg1)[%workgroup_size_x]
%10 = flow.dispatch.tensor.load %1, offsets = [0, %arg1], sizes = [512, %9], strides = [1, 1] : !flow.dispatch.tensor<readonly:512x128xf32> -> tensor<512x?xf32>
%11 = affine.min affine_map<(d0)[s0] -> (-d0 + 384, s0)>(%arg0)[%workgroup_size_y]
%12 = affine.min affine_map<(d0)[s0] -> (-d0 + 128, s0)>(%arg1)[%workgroup_size_x]
%13 = linalg.init_tensor [%11, %12] : tensor<?x?xf32>
%14 = linalg.fill(%cst, %13) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
%15 = linalg.matmul ins(%8, %10 : tensor<?x512xf32>, tensor<512x?xf32>) outs(%14 : tensor<?x?xf32>) -> tensor<?x?xf32>
flow.dispatch.tensor.store %15, %2, offsets = [%arg0, %arg1], sizes = [%7, %9], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:384x128xf32>
}
}
return
}
}
}
}

// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation.info<"CPUDoubleTilingExpert", workload_per_wg = [64, 64]>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering.config<tile_sizes = [{{\[}}], [288, 128, 512], [9, 32, 16]], native_vector_size = [9, 32, 16]>
// CHECK: linalg.matmul {lowering.config = #[[CONFIG]]}
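
Note that the RUN line sets test-lowering-configuration=true, so this test
only checks that the translation info and lowering configuration above are
materialized; it does not exercise the rest of the lowering pipeline.
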
4 changes: 4 additions & 0 deletions iree/compiler/Codegen/Passes.h
@@ -209,6 +209,10 @@ void addTensorToVectorsPassPipeline(OpPassManager &passManager,
/// using the Codegen drivers from sandbox.
void addSingleTilingExpertPassPipeline(OpPassManager &passManager);

/// Populates the passes needed to do two-level tile + vectorize of linalg ops
/// using the Codegen drivers from sandbox.
void addDoubleTilingExpertPassPipeline(OpPassManager &passManager);

/// Populates the passes needed to multi level tile, fuse and vectorize lowering
/// of linalg ops on tensors to vectors operations.
void addTileFuseAndVectorizePassPipeline(OpPassManager &passManager,
1 change: 1 addition & 0 deletions iree/compiler/Codegen/Sandbox/BUILD
@@ -62,6 +62,7 @@ cc_library(
"@llvm-project//mlir:Pass",
"@llvm-project//mlir:SCFDialect",
"@llvm-project//mlir:TensorDialect",
"@llvm-project//mlir:Transforms",
"@llvm-project//mlir:VectorOps",
"@llvm-project//mlir:X86VectorTransforms",
],
1 change: 1 addition & 0 deletions iree/compiler/Codegen/Sandbox/CMakeLists.txt
@@ -53,6 +53,7 @@ iree_cc_library(
MLIRPass
MLIRSCF
MLIRTensor
MLIRTransforms
MLIRVector
MLIRX86VectorTransforms
iree::compiler::Codegen::Dialect::IREECodegenDialect
31 changes: 23 additions & 8 deletions iree/compiler/Codegen/Sandbox/LinalgTensorCodegenDriver.cpp
@@ -19,6 +19,7 @@
#include "mlir/Dialect/Vector/VectorTransforms.h"
#include "mlir/Dialect/X86Vector/Transforms.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;
using namespace mlir::linalg;
@@ -121,6 +122,16 @@ struct LinalgVectorLoweringPass
this->vectorLoweringStage.setValue(vectorLoweringStage);
}
LinalgVectorLoweringPass(const LinalgVectorLoweringPass &pass) {}
LinalgVectorLoweringPass(const LinalgVectorLoweringPassOptions &options) {
this->vectorLoweringStage = options.vectorLoweringStage;
this->splitVectorTransfersTo = options.splitVectorTransfersTo;
this->lowerVectorTransposeTo = options.lowerVectorTransposeTo;
this->lowerVectorTransposeToAVX2 = options.lowerVectorTransposeToAVX2;
this->lowerVectorMultiReductionTo = options.lowerVectorMultiReductionTo;
this->lowerVectorContractionTo = options.lowerVectorContractionTo;
this->unrollVectorTransfers = options.unrollVectorTransfers;
this->maxTransferRank = options.maxTransferRank;
}

void runOnOperation() override;
};
@@ -346,19 +357,23 @@ std::unique_ptr<OperationPass<FuncOp>> mlir::createLinalgVectorLoweringPass(
int64_t vectorLoweringStage) {
return std::make_unique<LinalgVectorLoweringPass>(vectorLoweringStage);
}
std::unique_ptr<OperationPass<FuncOp>> mlir::createLinalgVectorLoweringPass(
const LinalgVectorLoweringPassOptions &options) {
return std::make_unique<LinalgVectorLoweringPass>(options);
}

//===----------------------------------------------------------------------===//
// Transforms
//===----------------------------------------------------------------------===//

void mlir::addLowerToVectorTransforms(OpPassManager &passManager) {
passManager.addPass(createLinalgVectorLoweringPass(0));
passManager.addPass(createLinalgVectorLoweringPass(1));
passManager.addPass(createLinalgVectorLoweringPass(2));
passManager.addPass(createLinalgVectorLoweringPass(3));
passManager.addPass(createLinalgVectorLoweringPass(4));
passManager.addPass(createLinalgVectorLoweringPass(5));
passManager.addPass(createLinalgVectorLoweringPass(6));
void mlir::addLowerToVectorTransforms(OpPassManager &passManager,
LinalgVectorLoweringPassOptions options) {
for (int i = 0; i < 7; ++i) {
options.vectorLoweringStage = i;
passManager.addPass(createLinalgVectorLoweringPass(options));
passManager.addPass(createCanonicalizerPass());
passManager.addPass(createCSEPass());
}
}
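
Each of the seven vector lowering stages (0 through 6) now runs with the
caller-provided options (only vectorLoweringStage is overwritten per
iteration), with canonicalization and CSE interleaved. A caller overrides just
the knobs it needs and keeps the struct's defaults for the rest, as the double
tiling expert pipeline above does:

  OpPassManager &nestedFuncPassManager = passManager.nest<FuncOp>();
  LinalgVectorLoweringPassOptions options;
  options.splitVectorTransfersTo = "linalg-copy";
  addLowerToVectorTransforms(nestedFuncPassManager, options);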

//===----------------------------------------------------------------------===//
17 changes: 16 additions & 1 deletion iree/compiler/Codegen/Sandbox/Passes.h
@@ -38,18 +38,33 @@ std::unique_ptr<OperationPass<FuncOp>> createLinalgSingleTilingExpertPass();
std::unique_ptr<OperationPass<FuncOp>> createLinalgSingleTilingExpertPass(
const LinalgSingleTilingExpertPassOptions &passOptions);

/// Struct to control pass options for `LinalgVectorLoweringPass` pass.
struct LinalgVectorLoweringPassOptions {
int vectorLoweringStage = 0;
std::string splitVectorTransfersTo = "";
std::string lowerVectorTransposeTo = "eltwise";
bool lowerVectorTransposeToAVX2 = false;
std::string lowerVectorMultiReductionTo = "innerparallel";
std::string lowerVectorContractionTo = "outerproduct";
bool unrollVectorTransfers = true;
int maxTransferRank = 1;
};

/// Creates a pass to drive the lowering of vector operations in a staged
/// manner.
std::unique_ptr<OperationPass<FuncOp>> createLinalgVectorLoweringPass(
int64_t vectorLoweringStage = 0);
std::unique_ptr<OperationPass<FuncOp>> createLinalgVectorLoweringPass(
const LinalgVectorLoweringPassOptions &options);

//===----------------------------------------------------------------------===//
// Transforms that tie together individual drivers.
//===----------------------------------------------------------------------===//

/// Add staged lowering of vector ops. `passManager` is expected to be a
/// `builtin.func` op pass manager.
void addLowerToVectorTransforms(OpPassManager &passManager);
void addLowerToVectorTransforms(OpPassManager &passManager,
LinalgVectorLoweringPassOptions options);

//===----------------------------------------------------------------------===//
// IREE specific pass creation methods to allow invocation from within IREEs
