-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[x32] Fix BLAKE3 assembly #149617
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[x32] Fix BLAKE3 assembly #149617
Conversation
@llvm/pr-subscribers-llvm-support Author: Harald van Dijk (hvdijk) Changes: The x86-64 assembly implementations of BLAKE3 are used in both 64-bit and 32-bit pointer mode, but they only worked in 64-bit pointer mode. This PR makes them work in 32-bit pointer mode as well. Full diff: https://github.com/llvm/llvm-project/pull/149617.diff 4 Files Affected:
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
index e98893c7ef8b8..f285fe119f4c1 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
@@ -45,6 +45,10 @@ blake3_hash_many_avx2:
mov rbp, rsp
sub rsp, 680
and rsp, 0xFFFFFFFFFFFFFFC0
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
neg r9d
vmovd xmm0, r9d
vpbroadcastd ymm0, xmm0
@@ -77,6 +81,7 @@ blake3_hash_many_avx2:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -85,6 +90,16 @@ blake3_hash_many_avx2:
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x10]
+ mov r13d, dword ptr [rdi+0x14]
+ mov r14d, dword ptr [rdi+0x18]
+ mov r15d, dword ptr [rdi+0x1c]
+#endif
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
@@ -1305,7 +1320,11 @@ blake3_hash_many_avx2:
vmovdqa ymm0, ymmword ptr [rsp+0x260]
vpsubd ymm2, ymm0, ymm2
vmovdqa ymmword ptr [rsp+0x260], ymm2
+#ifndef _ILP32
add rdi, 64
+#else
+ add rdi, 32
+#endif
add rbx, 256
mov qword ptr [rbp+0x50], rbx
sub rsi, 8
@@ -1346,10 +1365,17 @@ blake3_hash_many_avx2:
vpblendd ymm15, ymm15, ymm12, 0x44
vmovdqa ymmword ptr [rsp], ymm14
vmovdqa ymmword ptr [rsp+0x20], ymm15
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1557,7 +1583,11 @@ blake3_hash_many_avx2:
vmovaps xmmword ptr [rsp+0x240], xmm0
vmovaps xmmword ptr [rsp+0x260], xmm2
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
3:
test rsi, 0x2
@@ -1573,8 +1603,13 @@ blake3_hash_many_avx2:
vinserti128 ymm13, ymm13, xmm14, 0x01
vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1683,7 +1718,11 @@ blake3_hash_many_avx2:
vmovaps ymmword ptr [rsp+0x240], ymm0
vmovaps ymmword ptr [rsp+0x260], ymm2
add rbx, 64
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
sub rsi, 2
3:
test rsi, 0x1
@@ -1695,7 +1734,11 @@ blake3_hash_many_avx2:
vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm14, xmmword ptr [ROT16+rip]
vmovdqa xmm15, xmmword ptr [ROT8+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
index b4b14946de10e..709c4752d4084 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
@@ -59,6 +59,10 @@ blake3_hash_many_avx512:
sub rsp, 144
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
kmovw k1, r9d
vmovd xmm0, r8d
vpbroadcastd ymm0, xmm0
@@ -107,6 +111,7 @@ blake3_hash_many_avx512:
cmp rdx, qword ptr [rsp+0x80]
cmove eax, ebx
mov dword ptr [rsp+0x88], eax
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -115,6 +120,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x20]
+ mov r13d, dword ptr [rdi+0x24]
+ mov r14d, dword ptr [rdi+0x28]
+ mov r15d, dword ptr [rdi+0x2c]
+#endif
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
@@ -127,6 +142,7 @@ blake3_hash_many_avx512:
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
vpunpcklqdq zmm10, zmm18, zmm19
vpunpckhqdq zmm11, zmm18, zmm19
+#ifndef _ILP32
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
@@ -135,6 +151,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
+#else
+ mov r8d, dword ptr [rdi+0x10]
+ mov r9d, dword ptr [rdi+0x14]
+ mov r10d, dword ptr [rdi+0x18]
+ mov r11d, dword ptr [rdi+0x1c]
+ mov r12d, dword ptr [rdi+0x30]
+ mov r13d, dword ptr [rdi+0x34]
+ mov r14d, dword ptr [rdi+0x38]
+ mov r15d, dword ptr [rdi+0x3c]
+#endif
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
@@ -169,6 +195,7 @@ blake3_hash_many_avx512:
vmovdqa32 zmm23, zmm19
vpermt2d zmm19, zmm27, zmm8
vpermt2d zmm23, zmm31, zmm8
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -177,6 +204,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x48]
mov r14, qword ptr [rdi+0x50]
mov r15, qword ptr [rdi+0x58]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x20]
+ mov r13d, dword ptr [rdi+0x24]
+ mov r14d, dword ptr [rdi+0x28]
+ mov r15d, dword ptr [rdi+0x2c]
+#endif
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
@@ -197,6 +234,7 @@ blake3_hash_many_avx512:
prefetcht0 [r14+rdx+0x80]
prefetcht0 [r11+rdx+0x80]
prefetcht0 [r15+rdx+0x80]
+#ifndef _ILP32
mov r8, qword ptr [rdi+0x20]
mov r9, qword ptr [rdi+0x28]
mov r10, qword ptr [rdi+0x30]
@@ -205,6 +243,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x68]
mov r14, qword ptr [rdi+0x70]
mov r15, qword ptr [rdi+0x78]
+#else
+ mov r8d, dword ptr [rdi+0x10]
+ mov r9d, dword ptr [rdi+0x14]
+ mov r10d, dword ptr [rdi+0x18]
+ mov r11d, dword ptr [rdi+0x1c]
+ mov r12d, dword ptr [rdi+0x30]
+ mov r13d, dword ptr [rdi+0x34]
+ mov r14d, dword ptr [rdi+0x38]
+ mov r15d, dword ptr [rdi+0x3c]
+#endif
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
@@ -1095,7 +1143,11 @@ blake3_hash_many_avx512:
vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
vmovdqa32 zmmword ptr [rsp], zmm2
vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
+#ifndef _ILP32
add rdi, 128
+#else
+ add rdi, 64
+#endif
add rbx, 512
mov qword ptr [rbp+0x50], rbx
sub rsi, 16
@@ -1125,6 +1177,7 @@ blake3_hash_many_avx512:
vpbroadcastd ymm5, dword ptr [rcx+0x14]
vpbroadcastd ymm6, dword ptr [rcx+0x18]
vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
@@ -1133,6 +1186,16 @@ blake3_hash_many_avx512:
mov r13, qword ptr [rdi+0x28]
mov r14, qword ptr [rdi+0x30]
mov r15, qword ptr [rdi+0x38]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+ mov r12d, dword ptr [rdi+0x10]
+ mov r13d, dword ptr [rdi+0x14]
+ mov r14d, dword ptr [rdi+0x18]
+ mov r15d, dword ptr [rdi+0x1c]
+#endif
movzx eax, byte ptr [rbp+0x38]
movzx ebx, byte ptr [rbp+0x40]
or eax, ebx
@@ -2055,7 +2118,11 @@ blake3_hash_many_avx512:
vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
add rbx, 256
mov qword ptr [rbp+0x50], rbx
+#ifndef _ILP32
add rdi, 64
+#else
+ add rdi, 32
+#endif
sub rsi, 8
3:
mov rbx, qword ptr [rbp+0x50]
@@ -2078,10 +2145,17 @@ blake3_hash_many_avx512:
kmovw k2, eax
vpblendmd zmm13 {k2}, zmm13, zmm12
vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
mov eax, 43690
kmovw k3, eax
mov eax, 34952
@@ -2195,7 +2269,11 @@ blake3_hash_many_avx512:
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x40], xmm2
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
3:
test esi, 0x2
@@ -2209,8 +2287,13 @@ blake3_hash_many_avx512:
vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vinserti128 ymm13, ymm13, xmm14, 0x01
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -2308,7 +2391,11 @@ blake3_hash_many_avx512:
vmovdqa xmmword ptr [rsp], xmm0
vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
add rbx, 64
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
sub rsi, 2
3:
test esi, 0x1
@@ -2319,7 +2406,11 @@ blake3_hash_many_avx512:
vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
index d69a1706fefe7..85434df927cdd 100644
--- a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
@@ -54,6 +54,10 @@ blake3_hash_many_sse2:
sub rsp, 360
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9d
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
movd xmm0, r9d
pshufd xmm0, xmm0, 0x00
movdqa xmmword ptr [rsp+0x130], xmm0
@@ -91,10 +95,17 @@ blake3_hash_many_sse2:
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1648,7 +1659,11 @@ blake3_hash_many_sse2:
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
cmp rsi, 4
jnc 2b
@@ -1679,8 +1694,13 @@ blake3_hash_many_sse2:
movd xmm13, dword ptr [rsp+0x124]
punpckldq xmm14, xmm13
movaps xmmword ptr [rsp+0x10], xmm14
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1909,7 +1929,11 @@ blake3_hash_many_sse2:
mov r11d, dword ptr [rsp+0x120+8*rax]
mov dword ptr [rsp+0x110], r10d
mov dword ptr [rsp+0x120], r11d
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
add rbx, 64
sub rsi, 2
3:
@@ -1920,7 +1944,11 @@ blake3_hash_many_sse2:
movd xmm13, dword ptr [rsp+0x110]
movd xmm14, dword ptr [rsp+0x120]
punpckldq xmm13, xmm14
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
index c5b103af61c4f..403773421587c 100644
--- a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
@@ -54,6 +54,10 @@ blake3_hash_many_sse41:
sub rsp, 360
and rsp, 0xFFFFFFFFFFFFFFC0
neg r9d
+#ifdef _ILP32
+ mov esi, esi
+ mov edx, edx
+#endif
movd xmm0, r9d
pshufd xmm0, xmm0, 0x00
movdqa xmmword ptr [rsp+0x130], xmm0
@@ -91,10 +95,17 @@ blake3_hash_many_sse41:
pshufd xmm5, xmm7, 0x55
pshufd xmm6, xmm7, 0xAA
pshufd xmm7, xmm7, 0xFF
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
mov r10, qword ptr [rdi+0x10]
mov r11, qword ptr [rdi+0x18]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+ mov r10d, dword ptr [rdi+0x8]
+ mov r11d, dword ptr [rdi+0xc]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1452,7 +1463,11 @@ blake3_hash_many_sse41:
psubd xmm1, xmm0
movdqa xmmword ptr [rsp+0x120], xmm1
add rbx, 128
+#ifndef _ILP32
add rdi, 32
+#else
+ add rdi, 16
+#endif
sub rsi, 4
cmp rsi, 4
jnc 2b
@@ -1483,8 +1498,13 @@ blake3_hash_many_sse41:
pinsrd xmm14, dword ptr [rsp+0x124], 1
pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmmword ptr [rsp+0x10], xmm14
+#ifndef _ILP32
mov r8, qword ptr [rdi]
mov r9, qword ptr [rdi+0x8]
+#else
+ mov r8d, dword ptr [rdi]
+ mov r9d, dword ptr [rdi+0x4]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
@@ -1686,7 +1706,11 @@ blake3_hash_many_sse41:
blendvps xmm2, xmm4, xmm0
movdqa xmmword ptr [rsp+0x110], xmm1
movdqa xmmword ptr [rsp+0x120], xmm2
+#ifndef _ILP32
add rdi, 16
+#else
+ add rdi, 8
+#endif
add rbx, 64
sub rsi, 2
3:
@@ -1699,7 +1723,11 @@ blake3_hash_many_sse41:
pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
movaps xmm14, xmmword ptr [ROT8+rip]
movaps xmm15, xmmword ptr [ROT16+rip]
+#ifndef _ILP32
mov r8, qword ptr [rdi]
+#else
+ mov r8d, dword ptr [rdi]
+#endif
movzx eax, byte ptr [rbp+0x40]
or eax, r13d
xor edx, edx
|
The x86-64 assembly implementations of BLAKE3 are used in both 64-bit and 32-bit pointer mode, but they only worked in 64-bit pointer mode. This PR makes them work in 32-bit pointer mode as well.
The CI failures are unrelated to this PR; I will rebase after #149611 is merged. |
Please also submit this upstream (https://github.com/BLAKE3-team/BLAKE3). |
The x86-64 assembly implementations of BLAKE3 are used in both 64-bit and 32-bit pointer mode, but they only worked in 64-bit pointer mode. This PR makes them work in 32-bit pointer mode as well.