Skip to content

Commit 4797d19

Browse files
authored
ruapu cpu isa detection (#5341)
1 parent 592721d commit 4797d19

File tree

3 files changed: +331 -202 lines changed

codeformat.sh

+2 -2
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,9 @@
33
# we run clang-format and astyle twice to get stable format output
44

55
format_code() {
6-
find src/ tools/ tests/ examples/ benchmark/ python/ -type f -name '*.c' -o -name '*.cpp' -o -name '*.cc' -o -name '*.h' | grep -v python/pybind11 | grep -v stb_image | xargs -i clang-format -i {}
6+
find src/ tools/ tests/ examples/ benchmark/ python/ -type f -name '*.c' -o -name '*.cpp' -o -name '*.cc' -o -name '*.h' | grep -v python/pybind11 | grep -v stb_image | grep -v ruapu | xargs -i clang-format -i {}
77
astyle -n -r "benchmark/*.h,*.cpp,*.cc" "tests/*.h,*.cpp,*.cc" "tools/*.h,*.cpp,*.cc" "examples/*.h,*.cpp,*.cc"
8-
astyle -n -r "src/*.h,*.cpp,*.cc" --exclude=src/stb_image.h --exclude=src/stb_image_write.h
8+
astyle -n -r "src/*.h,*.cpp,*.cc" --exclude=src/stb_image.h --exclude=src/stb_image_write.h --exclude=src/ruapu.h
99
astyle -n -r "python/*.h,*.cpp,*.cc" --exclude=python/pybind11
1010
}
1111

src/cpu.cpp

+27 -200
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,9 @@
119119
#include <immintrin.h>
120120
#endif
121121

122+
#define RUAPU_IMPLEMENTATION
123+
#include "ruapu.h"
124+
122125
// topology info
123126
static int g_cpucount;
124127
static int g_physical_cpucount;
@@ -185,7 +188,6 @@ static int g_cpu_is_arm_a53_a55;
185188
#endif // __aarch64__
186189
#endif // defined __ANDROID__ || defined __linux__
187190

188-
static bool g_is_being_debugged = false;
189191
static bool is_being_debugged()
190192
{
191193
#if defined _WIN32
@@ -240,186 +242,6 @@ static bool is_being_debugged()
240242
#endif
241243
}
242244

243-
#if defined _WIN32
244-
#if WINAPI_FAMILY == WINAPI_FAMILY_APP
245-
static int detectisa(const void* /*some_inst*/)
246-
{
247-
// uwp does not support seh :(
248-
return 0;
249-
}
250-
#else // WINAPI_FAMILY == WINAPI_FAMILY_APP
251-
static int g_sigill_caught = 0;
252-
static jmp_buf g_jmpbuf;
253-
254-
static LONG CALLBACK catch_sigill(struct _EXCEPTION_POINTERS* ExceptionInfo)
255-
{
256-
if (ExceptionInfo->ExceptionRecord->ExceptionCode == EXCEPTION_ILLEGAL_INSTRUCTION)
257-
{
258-
g_sigill_caught = 1;
259-
longjmp(g_jmpbuf, -1);
260-
}
261-
262-
return EXCEPTION_CONTINUE_SEARCH;
263-
}
264-
265-
static int detectisa(const void* some_inst)
266-
{
267-
if (g_is_being_debugged)
268-
return 0;
269-
270-
g_sigill_caught = 0;
271-
272-
PVOID eh = AddVectoredExceptionHandler(1, catch_sigill);
273-
274-
if (setjmp(g_jmpbuf) == 0)
275-
{
276-
((void (*)())some_inst)();
277-
}
278-
279-
RemoveVectoredExceptionHandler(eh);
280-
281-
return g_sigill_caught ? 0 : 1;
282-
}
283-
#endif // WINAPI_FAMILY == WINAPI_FAMILY_APP
284-
285-
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
286-
#ifdef _MSC_VER
287-
#define DEFINE_INSTCODE(name, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned char name[] = {__VA_ARGS__, 0xc3};
288-
#else
289-
#define DEFINE_INSTCODE(name, ...) __attribute__((section(".text"))) static unsigned char name[] = {__VA_ARGS__, 0xc3};
290-
#endif
291-
#elif __aarch64__
292-
#ifdef _MSC_VER
293-
#define DEFINE_INSTCODE(name, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int name[] = {__VA_ARGS__, 0xd65f03c0};
294-
#else
295-
#define DEFINE_INSTCODE(name, ...) __attribute__((section(".text"))) static unsigned int name[] = {__VA_ARGS__, 0xd65f03c0};
296-
#endif
297-
#elif __arm__
298-
#ifdef _MSC_VER
299-
#define DEFINE_INSTCODE(name, ...) __pragma(section(".text")) __declspec(allocate(".text")) static unsigned int name[] = {__VA_ARGS__, 0x4770bf00};
300-
#else
301-
#define DEFINE_INSTCODE(name, ...) __attribute__((section(".text"))) static unsigned int name[] = {__VA_ARGS__, 0x4770bf00};
302-
#endif
303-
#endif
304-
305-
#elif defined __ANDROID__ || defined __linux__ || defined __APPLE__
306-
static int g_sigill_caught = 0;
307-
static sigjmp_buf g_jmpbuf;
308-
309-
static void catch_sigill(int /*signo*/, siginfo_t* /*si*/, void* /*data*/)
310-
{
311-
g_sigill_caught = 1;
312-
siglongjmp(g_jmpbuf, -1);
313-
}
314-
315-
static int detectisa(void (*some_inst)())
316-
{
317-
if (g_is_being_debugged)
318-
return 0;
319-
320-
g_sigill_caught = 0;
321-
322-
struct sigaction sa;
323-
struct sigaction old_sa;
324-
memset(&sa, 0, sizeof(sa));
325-
sa.sa_sigaction = catch_sigill;
326-
sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO;
327-
sigaction(SIGILL, &sa, &old_sa);
328-
329-
if (sigsetjmp(g_jmpbuf, 1) == 0)
330-
{
331-
some_inst();
332-
}
333-
334-
sigaction(SIGILL, &old_sa, NULL);
335-
336-
return g_sigill_caught ? 0 : 1;
337-
}
338-
339-
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
340-
#define DEFINE_INSTCODE(name, ...) \
341-
static void name() \
342-
{ \
343-
asm volatile(".byte " #__VA_ARGS__ \
344-
: \
345-
: \
346-
:); \
347-
};
348-
#elif __aarch64__
349-
#define DEFINE_INSTCODE(name, ...) \
350-
static void name() \
351-
{ \
352-
asm volatile(".word " #__VA_ARGS__ \
353-
: \
354-
: \
355-
:); \
356-
};
357-
#elif __arm__
358-
#define DEFINE_INSTCODE(name, ...) \
359-
static void name() \
360-
{ \
361-
asm volatile(".word " #__VA_ARGS__ \
362-
: \
363-
: \
364-
:); \
365-
};
366-
#endif
367-
368-
#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
369-
370-
#if defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
371-
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
372-
DEFINE_INSTCODE(some_mmx, 0x0f, 0xdb, 0xc0) // pand mm0,mm0
373-
DEFINE_INSTCODE(some_sse, 0x0f, 0x54, 0xc0) // andps xmm0,xmm0
374-
DEFINE_INSTCODE(some_sse2, 0x66, 0x0f, 0xfe, 0xc0) // paddd xmm0,xmm0
375-
DEFINE_INSTCODE(some_sse3, 0xf2, 0x0f, 0x7c, 0xc0) // haddps xmm0,xmm0
376-
DEFINE_INSTCODE(some_ssse3, 0x66, 0x0f, 0x38, 0x06, 0xc0) // phsubd xmm0,xmm0
377-
DEFINE_INSTCODE(some_sse41, 0x66, 0x0f, 0x38, 0x3d, 0xc0) // pmaxsd xmm0,xmm0
378-
DEFINE_INSTCODE(some_sse42, 0x66, 0x0f, 0x38, 0x37, 0xc0) // pcmpgtq xmm0,xmm0
379-
DEFINE_INSTCODE(some_sse4a, 0x66, 0x0f, 0x79, 0xc0) // extrq xmm0,xmm0
380-
DEFINE_INSTCODE(some_xop, 0x8f, 0xe8, 0x78, 0xb6, 0xc0, 0x00) // vpmadcswd %xmm0,%xmm0,%xmm0,%xmm0
381-
DEFINE_INSTCODE(some_avx, 0xc5, 0xfc, 0x54, 0xc0) // vandps ymm0,ymm0,ymm0
382-
DEFINE_INSTCODE(some_f16c, 0xc4, 0xe2, 0x7d, 0x13, 0xc0) // vcvtph2ps ymm0,xmm0
383-
DEFINE_INSTCODE(some_fma, 0xc4, 0xe2, 0x7d, 0x98, 0xc0) // vfmadd132ps ymm0,ymm0,ymm0
384-
DEFINE_INSTCODE(some_avx2, 0xc5, 0xfd, 0xfe, 0xc0) // vpaddd ymm0,ymm0,ymm0
385-
DEFINE_INSTCODE(some_avx512f, 0x62, 0xf1, 0x7c, 0x48, 0x58, 0xc0) // vaddps zmm0,zmm0,zmm0
386-
DEFINE_INSTCODE(some_avx512bw, 0x62, 0xf1, 0x7d, 0x48, 0xfd, 0xc0) // vpaddw zmm0,zmm0,zmm0
387-
DEFINE_INSTCODE(some_avx512cd, 0x62, 0xf2, 0xfd, 0x48, 0x44, 0xc0) // vplzcntq zmm0,zmm0
388-
DEFINE_INSTCODE(some_avx512dq, 0x62, 0xf1, 0x7c, 0x48, 0x54, 0xc0) // vandps zmm0,zmm0,zmm0
389-
DEFINE_INSTCODE(some_avx512vl, 0x62, 0xf2, 0xfd, 0x28, 0x1f, 0xc0) // vpabsq ymm0,ymm0
390-
DEFINE_INSTCODE(some_avx512vnni, 0x62, 0xf2, 0x7d, 0x48, 0x52, 0xc0) // vpdpwssd %zmm0,%zmm0,%zmm0
391-
DEFINE_INSTCODE(some_avx512bf16, 0x62, 0xf2, 0x7e, 0x48, 0x52, 0xc0) // vdpbf16ps %zmm0,%zmm0,%zmm0
392-
DEFINE_INSTCODE(some_avx512ifma, 0x62, 0xf2, 0xfd, 0x48, 0xb4, 0xc0) // vpmadd52luq %zmm0,%zmm0,%zmm0
393-
DEFINE_INSTCODE(some_avx512vbmi, 0x62, 0xf2, 0x7d, 0x48, 0x75, 0xc0) // vpermi2b %zmm0,%zmm0,%zmm0
394-
DEFINE_INSTCODE(some_avx512vbmi2, 0x62, 0xf2, 0x7d, 0x48, 0x71, 0xc0) // vpshldvd %zmm0,%zmm0,%zmm0
395-
DEFINE_INSTCODE(some_avx512fp16, 0x62, 0xf6, 0x7d, 0x48, 0x98, 0xc0) // vfmadd132ph %zmm0,%zmm0,%zmm0
396-
DEFINE_INSTCODE(some_avxvnni, 0x62, 0xf2, 0x7d, 0x28, 0x52, 0xc0) // vpdpwssd ymm0,ymm0,ymm0
397-
DEFINE_INSTCODE(some_avxvnniint8, 0xc4, 0xe2, 0x7f, 0x50, 0xc0) // vpdpbssd ymm0,ymm0,ymm0
398-
DEFINE_INSTCODE(some_avxifma, 0x62, 0xf2, 0xfd, 0x28, 0xb4, 0xc0) // vpmadd52luq %ymm0,%ymm0,%ymm0
399-
400-
#elif __aarch64__
401-
DEFINE_INSTCODE(some_neon, 0x4e20d400) // fadd v0.4s,v0.4s,v0.4s
402-
DEFINE_INSTCODE(some_vfpv4, 0x0e216800) // fcvtn v0.4h,v0.4s
403-
DEFINE_INSTCODE(some_cpuid, 0xd5380000) // mrs x0,midr_el1
404-
DEFINE_INSTCODE(some_asimdhp, 0x0e401400) // fadd v0.4h,v0.4h,v0.4h
405-
DEFINE_INSTCODE(some_asimddp, 0x4e809400) // sdot v0.4h,v0.16b,v0.16b
406-
DEFINE_INSTCODE(some_asimdfhm, 0x4e20ec00) // fmlal v0.4s,v0.4h,v0.4h
407-
DEFINE_INSTCODE(some_bf16, 0x6e40ec00) // bfmmla v0.4h,v0.8h,v0.8h
408-
DEFINE_INSTCODE(some_i8mm, 0x4e80a400) // smmla v0.4h,v0.16b,v0.16b
409-
DEFINE_INSTCODE(some_sve, 0x65608000) // fmad z0.h,p0/m,z0.h,z0.h
410-
DEFINE_INSTCODE(some_sve2, 0x44405000) // smlslb z0.h,z0.b,z0.b
411-
DEFINE_INSTCODE(some_svebf16, 0x6460e400) // bfmmla z0.s,z0.h,z0.h
412-
DEFINE_INSTCODE(some_svei8mm, 0x45009800) // smmla z0.s,z0.b,z0.b
413-
DEFINE_INSTCODE(some_svef32mm, 0x64a0e400) // fmmla z0.s,z0.s,z0.s
414-
415-
#elif __arm__
416-
DEFINE_INSTCODE(some_edsp, 0x0000fb20) // smlad r0,r0,r0,r0
417-
DEFINE_INSTCODE(some_neon, 0x0d40ef00) // vadd.f32 q0,q0,q0
418-
DEFINE_INSTCODE(some_vfpv4, 0x0600ffb6) // vcvt.f16.f32 d0,q0
419-
420-
#endif
421-
#endif // defined _WIN32 || defined __ANDROID__ || defined __linux__ || defined __APPLE__
422-
423245
#if defined __ANDROID__ || defined __linux__
424246

425247
#define AT_HWCAP 16
@@ -765,7 +587,7 @@ static int get_cpu_support_x86_avx2()
765587
static int get_cpu_support_x86_avx_vnni()
766588
{
767589
#if __APPLE__
768-
return detectisa(some_avxvnni);
590+
return ruapu_supports("avxvnni");
769591
#else
770592
unsigned int cpu_info[4] = {0};
771593
x86_cpuid(0, cpu_info);
@@ -791,7 +613,7 @@ static int get_cpu_support_x86_avx_vnni()
791613
static int get_cpu_support_x86_avx512()
792614
{
793615
#if __APPLE__
794-
return detectisa(some_avx512f) && detectisa(some_avx512bw) && detectisa(some_avx512cd) && detectisa(some_avx512dq) && detectisa(some_avx512vl);
616+
return ruapu_supports("avx512f") && ruapu_supports("avx512bw") && ruapu_supports("avx512cd") && ruapu_supports("avx512dq") && ruapu_supports("avx512vl");
795617
#else
796618
unsigned int cpu_info[4] = {0};
797619
x86_cpuid(0, cpu_info);
@@ -821,7 +643,7 @@ static int get_cpu_support_x86_avx512()
821643
static int get_cpu_support_x86_avx512_vnni()
822644
{
823645
#if __APPLE__
824-
return detectisa(some_avx512vnni);
646+
return ruapu_supports("avx512vnni");
825647
#else
826648
unsigned int cpu_info[4] = {0};
827649
x86_cpuid(0, cpu_info);
@@ -851,7 +673,7 @@ static int get_cpu_support_x86_avx512_vnni()
851673
static int get_cpu_support_x86_avx512_bf16()
852674
{
853675
#if __APPLE__
854-
return detectisa(some_avx512bf16);
676+
return ruapu_supports("avx512bf16");
855677
#else
856678
unsigned int cpu_info[4] = {0};
857679
x86_cpuid(0, cpu_info);
@@ -877,7 +699,7 @@ static int get_cpu_support_x86_avx512_bf16()
877699
static int get_cpu_support_x86_avx512_fp16()
878700
{
879701
#if __APPLE__
880-
return detectisa(some_avx512fp16);
702+
return ruapu_supports("avx512fp16");
881703
#else
882704
unsigned int cpu_info[4] = {0};
883705
x86_cpuid(0, cpu_info);
@@ -2035,25 +1857,30 @@ static void initialize_global_cpu_info()
20351857
g_powersave = 0;
20361858
initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);
20371859

2038-
g_is_being_debugged = is_being_debugged();
1860+
#if (defined _WIN32 && (__aarch64__ || __arm__)) || __APPLE__
1861+
if (!is_being_debugged())
1862+
{
1863+
ruapu_init();
1864+
}
1865+
#endif
20391866

20401867
#if defined _WIN32
20411868
#if __aarch64__
2042-
g_cpu_support_arm_cpuid = detectisa(some_cpuid);
2043-
g_cpu_support_arm_asimdhp = detectisa(some_asimdhp) || IsProcessorFeaturePresent(43); // dp implies hp
2044-
g_cpu_support_arm_asimddp = detectisa(some_asimddp) || IsProcessorFeaturePresent(43); // 43 is PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
2045-
g_cpu_support_arm_asimdfhm = detectisa(some_asimdfhm);
2046-
g_cpu_support_arm_bf16 = detectisa(some_bf16);
2047-
g_cpu_support_arm_i8mm = detectisa(some_i8mm);
2048-
g_cpu_support_arm_sve = detectisa(some_sve);
2049-
g_cpu_support_arm_sve2 = detectisa(some_sve2);
2050-
g_cpu_support_arm_svebf16 = detectisa(some_svebf16);
2051-
g_cpu_support_arm_svei8mm = detectisa(some_svei8mm);
2052-
g_cpu_support_arm_svef32mm = detectisa(some_svef32mm);
1869+
g_cpu_support_arm_cpuid = ruapu_supports("cpuid");
1870+
g_cpu_support_arm_asimdhp = ruapu_supports("asimdhp") || IsProcessorFeaturePresent(43); // dp implies hp
1871+
g_cpu_support_arm_asimddp = ruapu_supports("asimddp") || IsProcessorFeaturePresent(43); // 43 is PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
1872+
g_cpu_support_arm_asimdfhm = ruapu_supports("asimdfhm");
1873+
g_cpu_support_arm_bf16 = ruapu_supports("bf16");
1874+
g_cpu_support_arm_i8mm = ruapu_supports("i8mm");
1875+
g_cpu_support_arm_sve = ruapu_supports("sve");
1876+
g_cpu_support_arm_sve2 = ruapu_supports("sve2");
1877+
g_cpu_support_arm_svebf16 = ruapu_supports("svebf16");
1878+
g_cpu_support_arm_svei8mm = ruapu_supports("svei8mm");
1879+
g_cpu_support_arm_svef32mm = ruapu_supports("svef32mm");
20531880
#elif __arm__
2054-
g_cpu_support_arm_edsp = detectisa(some_edsp);
1881+
g_cpu_support_arm_edsp = ruapu_supports("edsp");
20551882
g_cpu_support_arm_neon = 1; // all modern windows arm devices have neon
2056-
g_cpu_support_arm_vfpv4 = detectisa(some_vfpv4);
1883+
g_cpu_support_arm_vfpv4 = ruapu_supports("vfpv4");
20571884
#endif // __aarch64__ || __arm__
20581885
#elif defined __ANDROID__ || defined __linux__
20591886
g_hwcaps = get_elf_hwcap(AT_HWCAP);

0 commit comments

Comments
 (0)