Skip to content

Commit 8c651b4

Browse files
Detect ARM CPU features for host target and in runtime (Backport to release/18.x) (#8343)
Detect ARM CPU features for host target and in runtime (#8298). Adds feature detection for ARM CPUs to the runtime library and to the host target feature computation. Supports Windows, macOS, Linux, iOS, and Android. Also fixes a bug in Type::max() and Type::min() for float16. Fixes #4727. Fixes #6106. Fixes #7901. Fixes #7979. Fixes #8340. Co-authored-by: Alex Reinking <[email protected]>
1 parent 41bc134 commit 8c651b4

14 files changed

+422
-42
lines changed

Diff for: Makefile

+5
Original file line numberDiff line numberDiff line change
@@ -828,6 +828,8 @@ RUNTIME_CPP_COMPONENTS = \
828828
hexagon_dma_pool \
829829
hexagon_host \
830830
ios_io \
831+
linux_aarch64_cpu_features \
832+
linux_arm_cpu_features \
831833
linux_clock \
832834
linux_host_cpu_count \
833835
linux_yield \
@@ -839,6 +841,8 @@ RUNTIME_CPP_COMPONENTS = \
839841
msan \
840842
msan_stubs \
841843
opencl \
844+
osx_aarch64_cpu_features \
845+
osx_arm_cpu_features \
842846
osx_clock \
843847
osx_get_symbol \
844848
osx_host_cpu_count \
@@ -873,6 +877,7 @@ RUNTIME_CPP_COMPONENTS = \
873877
wasm_cpu_features \
874878
webgpu_dawn \
875879
webgpu_emscripten \
880+
windows_aarch64_cpu_features_arm \
876881
windows_clock \
877882
windows_cuda \
878883
windows_d3d12compute_arm \

Diff for: src/LLVM_Runtime_Linker.cpp

+52-12
Original file line numberDiff line numberDiff line change
@@ -46,20 +46,31 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
4646
return std::unique_ptr<llvm::Module>(); \
4747
}
4848

49+
#define DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, bits) \
50+
do { \
51+
if (debug) { \
52+
return get_initmod_##mod##_##bits##_debug(context); \
53+
} else { \
54+
return get_initmod_##mod##_##bits(context); \
55+
} \
56+
} while (0)
57+
4958
#define DECLARE_CPP_INITMOD_LOOKUP(mod) \
5059
std::unique_ptr<llvm::Module> get_initmod_##mod(llvm::LLVMContext *context, bool bits_64, bool debug) { \
5160
if (bits_64) { \
52-
if (debug) { \
53-
return get_initmod_##mod##_64_debug(context); \
54-
} else { \
55-
return get_initmod_##mod##_64(context); \
56-
} \
61+
DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 64); \
62+
} else { \
63+
DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 32); \
64+
} \
65+
}
66+
67+
#define DECLARE_CPP_INITMOD_LOOKUP_64(mod) \
68+
std::unique_ptr<llvm::Module> get_initmod_##mod(llvm::LLVMContext *context, bool bits_64, bool debug) { \
69+
if (bits_64) { \
70+
DECLARE_CPP_INITMOD_LOOKUP_BITS(mod, 64); \
5771
} else { \
58-
if (debug) { \
59-
return get_initmod_##mod##_32_debug(context); \
60-
} else { \
61-
return get_initmod_##mod##_32(context); \
62-
} \
72+
internal_error << "No support for 32-bit initmod: " #mod; \
73+
return nullptr; /* appease warnings */ \
6374
} \
6475
}
6576

@@ -70,6 +81,11 @@ std::unique_ptr<llvm::Module> parse_bitcode_file(llvm::StringRef buf, llvm::LLVM
7081
DECLARE_INITMOD(mod##_64) \
7182
DECLARE_CPP_INITMOD_LOOKUP(mod)
7283

84+
#define DECLARE_CPP_INITMOD_64(mod) \
85+
DECLARE_INITMOD(mod##_64_debug) \
86+
DECLARE_INITMOD(mod##_64) \
87+
DECLARE_CPP_INITMOD_LOOKUP_64(mod)
88+
7389
#define DECLARE_LL_INITMOD(mod) \
7490
DECLARE_INITMOD(mod##_ll)
7591

@@ -183,18 +199,28 @@ DECLARE_NO_INITMOD(metal_objc_x86)
183199
DECLARE_LL_INITMOD(arm)
184200
DECLARE_LL_INITMOD(arm_no_neon)
185201
DECLARE_CPP_INITMOD(arm_cpu_features)
202+
DECLARE_CPP_INITMOD(linux_arm_cpu_features)
203+
DECLARE_CPP_INITMOD(osx_arm_cpu_features)
186204
#else
187205
DECLARE_NO_INITMOD(arm)
188206
DECLARE_NO_INITMOD(arm_no_neon)
189207
DECLARE_NO_INITMOD(arm_cpu_features)
208+
DECLARE_NO_INITMOD(linux_arm_cpu_features)
209+
DECLARE_NO_INITMOD(osx_arm_cpu_features)
190210
#endif // WITH_ARM
191211

192212
#ifdef WITH_AARCH64
193213
DECLARE_LL_INITMOD(aarch64)
194214
DECLARE_CPP_INITMOD(aarch64_cpu_features)
215+
DECLARE_CPP_INITMOD(linux_aarch64_cpu_features)
216+
DECLARE_CPP_INITMOD(osx_aarch64_cpu_features)
217+
DECLARE_CPP_INITMOD_64(windows_aarch64_cpu_features_arm)
195218
#else
196219
DECLARE_NO_INITMOD(aarch64)
197220
DECLARE_NO_INITMOD(aarch64_cpu_features)
221+
DECLARE_NO_INITMOD(linux_aarch64_cpu_features)
222+
DECLARE_NO_INITMOD(osx_aarch64_cpu_features)
223+
DECLARE_NO_INITMOD(windows_aarch64_cpu_features_arm)
198224
#endif // WITH_AARCH64
199225

200226
#ifdef WITH_NVPTX
@@ -1206,9 +1232,23 @@ std::unique_ptr<llvm::Module> get_initial_module_for_target(Target t, llvm::LLVM
12061232
}
12071233
if (t.arch == Target::ARM) {
12081234
if (t.bits == 64) {
1209-
modules.push_back(get_initmod_aarch64_cpu_features(c, bits_64, debug));
1235+
if (t.os == Target::Android || t.os == Target::Linux) {
1236+
modules.push_back(get_initmod_linux_aarch64_cpu_features(c, bits_64, debug));
1237+
} else if (t.os == Target::OSX || t.os == Target::IOS) {
1238+
modules.push_back(get_initmod_osx_aarch64_cpu_features(c, bits_64, debug));
1239+
} else if (t.os == Target::Windows) {
1240+
modules.push_back(get_initmod_windows_aarch64_cpu_features_arm(c, bits_64, debug));
1241+
} else {
1242+
modules.push_back(get_initmod_aarch64_cpu_features(c, bits_64, debug));
1243+
}
12101244
} else {
1211-
modules.push_back(get_initmod_arm_cpu_features(c, bits_64, debug));
1245+
if (t.os == Target::Android || t.os == Target::Linux) {
1246+
modules.push_back(get_initmod_linux_arm_cpu_features(c, bits_64, debug));
1247+
} else if (t.os == Target::OSX || t.os == Target::IOS) {
1248+
modules.push_back(get_initmod_osx_arm_cpu_features(c, bits_64, debug));
1249+
} else {
1250+
modules.push_back(get_initmod_arm_cpu_features(c, bits_64, debug));
1251+
}
12121252
}
12131253
}
12141254
if (t.arch == Target::POWERPC) {

Diff for: src/Target.cpp

+115-7
Original file line numberDiff line numberDiff line change
@@ -21,23 +21,50 @@
2121
#endif
2222

2323
#ifdef _MSC_VER
24+
#define NOMINMAX
25+
#define WIN32_LEAN_AND_MEAN
2426
#include <intrin.h>
27+
#include <windows.h>
2528
#endif // _MSC_VER
2629

30+
#ifdef __APPLE__
31+
#include <mach/machine.h>
32+
#include <sys/sysctl.h>
33+
#include <sys/types.h>
34+
#endif
35+
36+
#if defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
37+
#include <asm/hwcap.h>
38+
#include <sys/auxv.h>
39+
#ifndef HWCAP_ASIMDHP
40+
#define HWCAP_ASIMDHP 0
41+
#endif
42+
#ifndef HWCAP_ASIMDDP
43+
#define HWCAP_ASIMDDP 0
44+
#endif
45+
#ifndef HWCAP_SVE
46+
#define HWCAP_SVE 0
47+
#endif
48+
#ifndef HWCAP2_SVE2
49+
#define HWCAP2_SVE2 0
50+
#endif
51+
#endif
52+
2753
namespace Halide {
2854

2955
using std::string;
3056
using std::vector;
3157

3258
namespace {
3359

34-
#ifdef _MSC_VER
35-
static void cpuid(int info[4], int infoType, int extra) {
60+
#if defined(_M_IX86) || defined(_M_AMD64)
61+
62+
void cpuid(int info[4], int infoType, int extra) {
3663
__cpuidex(info, infoType, extra);
3764
}
38-
#else
3965

40-
#if defined(__x86_64__) || defined(__i386__)
66+
#elif defined(__x86_64__) || defined(__i386__)
67+
4168
// CPU feature detection code taken from ispc
4269
// (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll)
4370

@@ -47,10 +74,10 @@ void cpuid(int info[4], int infoType, int extra) {
4774
: "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
4875
: "0"(infoType), "2"(extra));
4976
}
50-
#endif
77+
5178
#endif
5279

53-
#if defined(__x86_64__) || defined(__i386__) || defined(_MSC_VER)
80+
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
5481

5582
enum class VendorSignatures {
5683
Unknown,
@@ -143,6 +170,29 @@ Target::Processor get_amd_processor(unsigned family, unsigned model, bool have_s
143170

144171
#endif  // defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
145172

173+
#ifdef __APPLE__
174+
175+
template<typename T>
176+
std::optional<T> getsysctl(const char *name) {
177+
T value;
178+
size_t size = sizeof(value);
179+
if (sysctlbyname(name, &value, &size, nullptr, 0)) {
180+
return std::nullopt;
181+
}
182+
return std::make_optional(value);
183+
}
184+
185+
bool sysctl_is_set(const char *name) {
186+
return getsysctl<int>(name).value_or(0);
187+
}
188+
189+
bool is_armv7s() {
190+
return getsysctl<cpu_type_t>("hw.cputype") == CPU_TYPE_ARM &&
191+
getsysctl<cpu_subtype_t>("hw.cpusubtype") == CPU_SUBTYPE_ARM_V7S;
192+
}
193+
194+
#endif // __APPLE__
195+
146196
Target calculate_host_target() {
147197
Target::OS os = Target::OSUnknown;
148198
#ifdef __linux__
@@ -164,8 +214,66 @@ Target calculate_host_target() {
164214
#if __riscv
165215
Target::Arch arch = Target::RISCV;
166216
#else
167-
#if defined(__arm__) || defined(__aarch64__)
217+
#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
168218
Target::Arch arch = Target::ARM;
219+
220+
#ifdef __APPLE__
221+
if (is_armv7s()) {
222+
initial_features.push_back(Target::ARMv7s);
223+
}
224+
225+
if (sysctl_is_set("hw.optional.arm.FEAT_DotProd")) {
226+
initial_features.push_back(Target::ARMDotProd);
227+
}
228+
229+
if (sysctl_is_set("hw.optional.arm.FEAT_FP16")) {
230+
initial_features.push_back(Target::ARMFp16);
231+
}
232+
#endif
233+
234+
#ifdef __linux__
235+
unsigned long hwcaps = getauxval(AT_HWCAP);
236+
unsigned long hwcaps2 = getauxval(AT_HWCAP2);
237+
238+
if (hwcaps & HWCAP_ASIMDDP) {
239+
initial_features.push_back(Target::ARMDotProd);
240+
}
241+
242+
if (hwcaps & HWCAP_ASIMDHP) {
243+
initial_features.push_back(Target::ARMFp16);
244+
}
245+
246+
if (hwcaps & HWCAP_SVE) {
247+
initial_features.push_back(Target::SVE);
248+
}
249+
250+
if (hwcaps2 & HWCAP2_SVE2) {
251+
initial_features.push_back(Target::SVE2);
252+
}
253+
#endif
254+
255+
#ifdef _MSC_VER
256+
257+
// Magic value from: https://github.com/dotnet/runtime/blob/7e977dcbe5efaeec2c75ed0c3e200c85b2e55522/src/native/minipal/cpufeatures.c#L19
258+
#define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE (46)
259+
260+
// This is the strategy used by Google's cpuinfo library for
261+
// detecting fp16 arithmetic support on Windows.
262+
if (!IsProcessorFeaturePresent(PF_FLOATING_POINT_EMULATED) &&
263+
IsProcessorFeaturePresent(PF_ARM_FMAC_INSTRUCTIONS_AVAILABLE)) {
264+
initial_features.push_back(Target::ARMFp16);
265+
}
266+
267+
if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) {
268+
initial_features.push_back(Target::ARMDotProd);
269+
}
270+
271+
if (IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE)) {
272+
initial_features.push_back(Target::SVE);
273+
}
274+
275+
#endif
276+
169277
#else
170278
#if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
171279
Target::Arch arch = Target::POWERPC;

Diff for: src/Type.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ Halide::Expr Type::max() const {
3535
} else {
3636
internal_assert(is_float());
3737
if (bits() == 16) {
38-
return Internal::FloatImm::make(*this, 65504.0);
38+
return Internal::FloatImm::make(*this, (double)float16_t::make_infinity());
3939
} else if (bits() == 32) {
4040
return Internal::FloatImm::make(*this, std::numeric_limits<float>::infinity());
4141
} else if (bits() == 64) {
@@ -59,7 +59,7 @@ Halide::Expr Type::min() const {
5959
} else {
6060
internal_assert(is_float());
6161
if (bits() == 16) {
62-
return Internal::FloatImm::make(*this, -65504.0);
62+
return Internal::FloatImm::make(*this, (double)float16_t::make_negative_infinity());
6363
} else if (bits() == 32) {
6464
return Internal::FloatImm::make(*this, -std::numeric_limits<float>::infinity());
6565
} else if (bits() == 64) {

Diff for: src/Util.cpp

+8-1
Original file line numberDiff line numberDiff line change
@@ -859,7 +859,14 @@ void run_with_large_stack(const std::function<void()> &action) {
859859
// Portable bit-counting methods
860860
int popcount64(uint64_t x) {
861861
#ifdef _MSC_VER
862-
#if defined(_WIN64)
862+
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64_EC)
863+
int popcnt = 0;
864+
while (x) {
865+
x &= x - 1;
866+
popcnt++;
867+
}
868+
return popcnt;
869+
#elif defined(_WIN64)
863870
return __popcnt64(x);
864871
#else
865872
return __popcnt((uint32_t)(x >> 32)) + __popcnt((uint32_t)(x & 0xffffffff));

Diff for: src/runtime/CMakeLists.txt

+5
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ set(RUNTIME_CPP
3232
hexagon_dma_pool
3333
hexagon_host
3434
ios_io
35+
linux_aarch64_cpu_features
36+
linux_arm_cpu_features
3537
linux_clock
3638
linux_host_cpu_count
3739
linux_yield
@@ -43,6 +45,8 @@ set(RUNTIME_CPP
4345
msan
4446
msan_stubs
4547
opencl
48+
osx_aarch64_cpu_features
49+
osx_arm_cpu_features
4650
osx_clock
4751
osx_get_symbol
4852
osx_host_cpu_count
@@ -80,6 +84,7 @@ set(RUNTIME_CPP
8084
# webgpu
8185
webgpu_dawn
8286
webgpu_emscripten
87+
windows_aarch64_cpu_features_arm
8388
windows_clock
8489
windows_cuda
8590
windows_d3d12compute_arm

0 commit comments

Comments
 (0)