From 4cc6671320e881d4b3f112417fcd253ad6b031ce Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Tue, 22 Oct 2024 08:18:56 +0100 Subject: [PATCH] [CPU] Limit vectorization tile sizes for SVE (#18846) This prevents large vector sizes. A regression test is included with a linalg.pooling_nchw_max operation that currently fails to compile with error: One or more operations with large vector sizes (8192 bytes) were found: when SVE is enabled, even though SVE isn't used. --------- Signed-off-by: Cullen Rhodes --- .../Codegen/LLVMCPU/KernelDispatch.cpp | 7 +--- .../select_aarch64_sve_lowering_strategy.mlir | 39 +++++++++++++++++-- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp index 6f9983454af5..5111b7668958 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp @@ -792,11 +792,8 @@ static int getRegisterSpaceBitsIfKnown(IREE::HAL::ExecutableTargetAttr target) { return 16 * 128; } } else if (isAArch64(target)) { - // Can't determine register space size at compile time on SVE. - if (hasFeature(target, "+sve") || hasFeature(target, "+sve2")) { - return 0; - } - // 32 NEON registers (128-bit each). + // 32 NEON/SVE registers (at least 128-bit each, returns the base size for + // SVE). 
return 32 * 128; } else { // Don't know register space size as a compile-time constant on other diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir index 757a039ed119..1308442f23bf 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/select_aarch64_sve_lowering_strategy.mlir @@ -28,7 +28,7 @@ func.func @matmul_tensors() attributes {hal.executable.target = #executable_targ return } -// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config +// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info // CHECK: func.func @matmul_tensors() // CHECK-SAME: translation_info = #[[TRANSLATION]] @@ -118,7 +118,7 @@ func.func @matmul_tensors() attributes {hal.executable.target = #executable_targ return } -// DISABLE-ARM-SME-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config +// DISABLE-ARM-SME-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config // DISABLE-ARM-SME-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info // DISABLE-ARM-SME: func.func @matmul_tensors() // DISABLE-ARM-SME-SAME: translation_info = #[[TRANSLATION]] @@ -179,8 +179,8 @@ func.func @matmul_with_fill() attributes {hal.executable.target = #executable_ta return } -// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config -// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config +// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config +// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config // CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info // CHECK: func.func @matmul_with_fill() // CHECK-SAME: translation_info = #[[TRANSLATION]] @@ -217,3 +217,34 @@ func.func @depthwise_conv() attributes {hal.executable.target = #executable_targ // CHECK-SAME: 
translation_info = #[[TRANSLATION]] // CHECK: linalg.depthwise_conv_2d_nhwc_hwc // CHECK-SAME: lowering_config = #[[CONFIG]] + +// ----- + +// Regression test. SVE isn't used (scalable vectorization of this op is not yet +// supported), but used to fail to compile when SVE was enabled due to tile +// sizes leading to large vectors. + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding +]> +#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}> +func.func @pooling_nchw_max(%arg0: !flow.dispatch.tensor>, %arg1: !flow.dispatch.tensor>) attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} { + %cst = arith.constant 0.0 : f32 + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : !flow.dispatch.tensor> + %2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 64, 114, 114], strides = [1, 1, 1, 1] : !flow.dispatch.tensor> -> tensor<1x64x114x114xf32> + %3 = tensor.empty() : tensor<1x64x56x56xf32> + %4 = tensor.empty() : tensor<3x3xf32> + %5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32> + %6 = linalg.pooling_nchw_max {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%2, %4 : tensor<1x64x114x114xf32>, tensor<3x3xf32>) outs(%3 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32> + flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 64, 56, 56], strides = [1, 1, 1, 1] : tensor<1x64x56x56xf32> -> !flow.dispatch.tensor> + return +} + +// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config +// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info +// CHECK: func.func @pooling_nchw_max 
+// CHECK-SAME: translation_info = #[[TRANSLATION]] +// CHECK: linalg.pooling_nchw_max +// CHECK-SAME: lowering_config = #[[CONFIG]]