Skip to content

Commit

Permalink
[CPU] Limit vectorization tile sizes for SVE (#18846)
Browse files Browse the repository at this point in the history
This prevents large vector sizes. A regression test is included with a
linalg.pooling_nchw_max operation that currently fails to compile with

error: One or more operations with large vector sizes (8192 bytes) were
found:

when SVE is enabled, even though SVE isn't used.

---------

Signed-off-by: Cullen Rhodes <[email protected]>
  • Loading branch information
c-rhodes authored Oct 22, 2024
1 parent d0269f3 commit 4cc6671
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 9 deletions.
7 changes: 2 additions & 5 deletions compiler/src/iree/compiler/Codegen/LLVMCPU/KernelDispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -792,11 +792,8 @@ static int getRegisterSpaceBitsIfKnown(IREE::HAL::ExecutableTargetAttr target) {
return 16 * 128;
}
} else if (isAArch64(target)) {
// Can't determine register space size at compile time on SVE.
if (hasFeature(target, "+sve") || hasFeature(target, "+sve2")) {
return 0;
}
// 32 NEON registers (128-bit each).
// 32 NEON/SVE registers (at least 128-bit each, returns the base size for
// SVE).
return 32 * 128;
} else {
// Don't know register space size as a compile-time constant on other
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func.func @matmul_tensors() attributes {hal.executable.target = #executable_targ
return
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [4, [16], 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: func.func @matmul_tensors()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
Expand Down Expand Up @@ -118,7 +118,7 @@ func.func @matmul_tensors() attributes {hal.executable.target = #executable_targ
return
}

// DISABLE-ARM-SME-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
// DISABLE-ARM-SME-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [4, [16], 0], [0, 0, 1], [0, 0, 0]]>
// DISABLE-ARM-SME-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// DISABLE-ARM-SME: func.func @matmul_tensors()
// DISABLE-ARM-SME-SAME: translation_info = #[[TRANSLATION]]
Expand Down Expand Up @@ -179,8 +179,8 @@ func.func @matmul_with_fill() attributes {hal.executable.target = #executable_ta
return
}

// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [8, [16]], [0, 0], [0, 0]]>
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [8, [16], 0], [0, 0, 1], [0, 0, 0]]>
// CHECK-DAG: #[[CONFIG1:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64], [4, [16]], [0, 0], [0, 0]]>
// CHECK-DAG: #[[CONFIG2:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0], [4, [16], 0], [0, 0, 1], [0, 0, 0]]>
// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUDoubleTilingExpert>
// CHECK: func.func @matmul_with_fill()
// CHECK-SAME: translation_info = #[[TRANSLATION]]
Expand Down Expand Up @@ -217,3 +217,34 @@ func.func @depthwise_conv() attributes {hal.executable.target = #executable_targ
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.depthwise_conv_2d_nhwc_hwc
// CHECK-SAME: lowering_config = #[[CONFIG]]

// -----

// Regression test. SVE isn't used (scalable vectorizaton of this op is not yet
// supported), but used to fail to compile when SVE was enabled due to tile
// sizes leading to large vectors.

#pipeline_layout = #hal.pipeline.layout<bindings = [
#hal.pipeline.binding<storage_buffer>,
#hal.pipeline.binding<storage_buffer>
]>
#executable_target_embedded_elf_arm_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-arm_64", {cpu_features = "+sve", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", native_vector_size = 16 : index, target_triple = "aarch64-none-elf"}>
func.func @pooling_nchw_max(%arg0: !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>>, %arg1: !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>) attributes {hal.executable.target = #executable_target_embedded_elf_arm_64_} {
%cst = arith.constant 0.0 : f32
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 64, 114, 114], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x64x114x114xf32>> -> tensor<1x64x114x114xf32>
%3 = tensor.empty() : tensor<1x64x56x56xf32>
%4 = tensor.empty() : tensor<3x3xf32>
%5 = linalg.fill ins(%cst : f32) outs(%3 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
%6 = linalg.pooling_nchw_max {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%2, %4 : tensor<1x64x114x114xf32>, tensor<3x3xf32>) outs(%3 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
flow.dispatch.tensor.store %6, %1, offsets = [0, 0, 0, 0], sizes = [1, 64, 56, 56], strides = [1, 1, 1, 1] : tensor<1x64x56x56xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x64x56x56xf32>>
return
}

// CHECK-DAG: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[0, 32, 56, 8, 0, 0], [1, 2, 1, 8, 0, 0], [0, 0, 0, 0, 1, 3], [0, 0, 0, 0, 0, 0]]>
// CHECK-DAG: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<CPUConvTileAndDecomposeExpert>
// CHECK: func.func @pooling_nchw_max
// CHECK-SAME: translation_info = #[[TRANSLATION]]
// CHECK: linalg.pooling_nchw_max
// CHECK-SAME: lowering_config = #[[CONFIG]]

0 comments on commit 4cc6671

Please sign in to comment.