Skip to content

Commit 3142aaf

Browse files
authored
Merge pull request #4063 from Feoramund/simd-memory
Vectorize `base:runtime.memory_*`
2 parents 0d0f311 + 45219f2 commit 3142aaf

File tree

7 files changed

+457
-127
lines changed

7 files changed

+457
-127
lines changed

base/runtime/internal.odin

Lines changed: 141 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,12 @@ RUNTIME_REQUIRE :: false // !ODIN_TILDE
1616
@(private)
1717
__float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
1818

19+
// HAS_HARDWARE_SIMD is a compile-time constant. It is `false` when the target
// architecture belongs to one of the families listed below but the build lacks
// the SIMD target feature checked for that family (`sse2`, `neon`, `simd128`,
// or `v`). It is `true` in every other case, including architectures that are
// not listed here.
HAS_HARDWARE_SIMD :: false when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
20+
false when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
21+
false when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
22+
false when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
23+
true
24+
1925

2026
@(private)
2127
byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
@@ -229,150 +235,173 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
229235
case n == 0: return true
230236
case x == y: return true
231237
}
232-
a, b := ([^]byte)(x), ([^]byte)(y)
233-
length := uint(n)
234-
235-
for i := uint(0); i < length; i += 1 {
236-
if a[i] != b[i] {
237-
return false
238-
}
239-
}
240-
return true
241-
242-
/*
243-
244-
when size_of(uint) == 8 {
245-
if word_length := length >> 3; word_length != 0 {
246-
for _ in 0..<word_length {
247-
if intrinsics.unaligned_load((^u64)(a)) != intrinsics.unaligned_load((^u64)(b)) {
248-
return false
238+
a, b := cast([^]byte)x, cast([^]byte)y
239+
240+
n := uint(n)
241+
i := uint(0)
242+
m := uint(0)
243+
244+
if n >= 8 {
245+
when HAS_HARDWARE_SIMD {
246+
// Avoid using 256-bit SIMD on platforms where its emulation is
247+
// likely to be less than ideal.
248+
when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
249+
m = n / 32 * 32
250+
for /**/; i < m; i += 32 {
251+
load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
252+
load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
253+
ne := intrinsics.simd_lanes_ne(load_a, load_b)
254+
if intrinsics.simd_reduce_or(ne) != 0 {
255+
return false
256+
}
249257
}
250-
a = a[size_of(u64):]
251-
b = b[size_of(u64):]
252258
}
253259
}
254-
255-
if length & 4 != 0 {
256-
if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
260+
261+
m = (n-i) / 16 * 16
262+
for /**/; i < m; i += 16 {
263+
load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
264+
load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
265+
ne := intrinsics.simd_lanes_ne(load_a, load_b)
266+
if intrinsics.simd_reduce_or(ne) != 0 {
257267
return false
258268
}
259-
a = a[size_of(u32):]
260-
b = b[size_of(u32):]
261269
}
262-
263-
if length & 2 != 0 {
264-
if intrinsics.unaligned_load((^u16)(a)) != intrinsics.unaligned_load((^u16)(b)) {
270+
271+
m = (n-i) / 8 * 8
272+
for /**/; i < m; i += 8 {
273+
if intrinsics.unaligned_load(cast(^uintptr)&a[i]) != intrinsics.unaligned_load(cast(^uintptr)&b[i]) {
265274
return false
266275
}
267-
a = a[size_of(u16):]
268-
b = b[size_of(u16):]
269276
}
270-
271-
if length & 1 != 0 && a[0] != b[0] {
272-
return false
273-
}
274-
return true
275-
} else {
276-
if word_length := length >> 2; word_length != 0 {
277-
for _ in 0..<word_length {
278-
if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
279-
return false
280-
}
281-
a = a[size_of(u32):]
282-
b = b[size_of(u32):]
283-
}
284-
}
285-
286-
length &= 3
287-
288-
if length != 0 {
289-
for i in 0..<length {
290-
if a[i] != b[i] {
291-
return false
292-
}
293-
}
294-
}
295-
296-
return true
297277
}
298-
*/
299278

279+
for /**/; i < n; i += 1 {
280+
if a[i] != b[i] {
281+
return false
282+
}
283+
}
284+
return true
300285
}
301-
// Lexicographically compares the `n` bytes at `x` with the `n` bytes at `y`,
// treating every byte as an unsigned value.
//
// Inputs:
// - x, y: pointers to the two memory regions
// - n:    number of bytes to compare
//
// Returns:
// -  0 when `x == y`, or when all `n` bytes match
// - -1 when `x` is nil and `y` is not, or when the first differing byte of `x` is the smaller one
// - +1 when `y` is nil and `x` is not, or when the first differing byte of `x` is the larger one
memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check {
	switch {
	case x == y:   return 0
	case x == nil: return -1
	case y == nil: return +1
	}
	a, b := cast([^]byte)x, cast([^]byte)y

	n := uint(n)
	i := uint(0) // index of the next byte that has not been compared yet
	m := uint(0) // exclusive end index of the chunked loop currently running

	when HAS_HARDWARE_SIMD {
		// Avoid using 256-bit SIMD on platforms where its emulation is
		// likely to be less than ideal.
		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
			m = n / 32 * 32
			for /**/; i < m; i += 32 {
				load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
				load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
				comparison := intrinsics.simd_lanes_ne(load_a, load_b)
				if intrinsics.simd_reduce_or(comparison) != 0 {
					// Lanes that differ keep their own index, all others become
					// 0xFF, so the minimum is the index of the first difference.
					sentinel: #simd[32]u8 = u8(0xFF)
					indices := intrinsics.simd_indices(#simd[32]u8)
					index_select := intrinsics.simd_select(comparison, indices, sentinel)
					index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
					return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
				}
			}
		}
	}

	// `m` is an end index, not a byte count, so it has to be offset by `i`.
	// Without the offset this loop would be skipped whenever a wider loop
	// above has already advanced `i`.
	m = i + (n-i) / 16 * 16
	for /**/; i < m; i += 16 {
		load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
		load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
		if intrinsics.simd_reduce_or(comparison) != 0 {
			sentinel: #simd[16]u8 = u8(0xFF)
			indices := intrinsics.simd_indices(#simd[16]u8)
			index_select := intrinsics.simd_select(comparison, indices, sentinel)
			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
		}
	}

	// 64-bit SIMD is faster than using a `uintptr` to detect a difference then
	// re-iterating with the byte-by-byte loop, at least on AMD64.
	m = i + (n-i) / 8 * 8
	for /**/; i < m; i += 8 {
		load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
		load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
		if intrinsics.simd_reduce_or(comparison) != 0 {
			sentinel: #simd[8]u8 = u8(0xFF)
			indices := intrinsics.simd_indices(#simd[8]u8)
			index_select := intrinsics.simd_select(comparison, indices, sentinel)
			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
		}
	}

	// Remaining tail of fewer than 8 bytes.
	for /**/; i < n; i += 1 {
		if a[i] != b[i] {
			return -1 if a[i] < b[i] else +1
		}
	}
	return 0
}
344354

345355
// Reports whether any of the `n` bytes starting at `a` is non-zero.
//
// Inputs:
// - a: pointer to the memory region
// - n: number of bytes to inspect
//
// Returns:
// - 0 when every inspected byte is zero (this includes `n == 0`)
// - 1 as soon as a non-zero byte is found
memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
	n := uint(n)
	i := uint(0) // index of the next byte that has not been inspected yet
	m := uint(0) // exclusive end index of the chunked loop currently running

	// Because we're comparing against zero, we never return -1, as that would
	// indicate the compared value is less than zero.
	//
	// Note that a zero return value here means equality.

	bytes := ([^]u8)(a)

	if n >= 8 {
		when HAS_HARDWARE_SIMD {
			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
				scanner32: #simd[32]u8 // zero-initialized reference vector
				m = n / 32 * 32
				for /**/; i < m; i += 32 {
					load := intrinsics.unaligned_load(cast(^#simd[32]u8)&bytes[i])
					ne := intrinsics.simd_lanes_ne(scanner32, load)
					if intrinsics.simd_reduce_or(ne) != 0 {
						return 1
					}
				}
			}
		}

		// `m` is an end index, not a byte count, so it has to be offset by `i`.
		// Without the offset this loop would be skipped whenever a wider loop
		// above has already advanced `i`.
		scanner16: #simd[16]u8 // zero-initialized reference vector
		m = i + (n-i) / 16 * 16
		for /**/; i < m; i += 16 {
			load := intrinsics.unaligned_load(cast(^#simd[16]u8)&bytes[i])
			ne := intrinsics.simd_lanes_ne(scanner16, load)
			if intrinsics.simd_reduce_or(ne) != 0 {
				return 1
			}
		}

		// Load a fixed-width `u64` rather than a `uintptr`: the loop advances
		// 8 bytes per step, and a `uintptr` is only 4 bytes wide on 32-bit
		// targets, which would leave half of every step unchecked.
		m = i + (n-i) / 8 * 8
		for /**/; i < m; i += 8 {
			if intrinsics.unaligned_load(cast(^u64)&bytes[i]) != 0 {
				return 1
			}
		}
	}

	// Remaining tail, or the whole region when `n < 8`.
	for /**/; i < n; i += 1 {
		if bytes[i] != 0 {
			return 1
		}
	}
	return 0
}
378407

core/bytes/bytes.odin

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -350,7 +350,7 @@ index_byte :: proc "contextless" (s: []byte, c: byte) -> (index: int) #no_bounds
350350
}
351351

352352
c_vec: simd.u8x16 = c
353-
when !simd.IS_EMULATED {
353+
when simd.HAS_HARDWARE_SIMD {
354354
// Note: While this is something that could also logically take
355355
// advantage of AVX512, the various downclocking and power
356356
// consumption related woes make it premature to have a dedicated
@@ -485,7 +485,7 @@ last_index_byte :: proc "contextless" (s: []byte, c: byte) -> int #no_bounds_che
485485
}
486486

487487
c_vec: simd.u8x16 = c
488-
when !simd.IS_EMULATED {
488+
when simd.HAS_HARDWARE_SIMD {
489489
// Note: While this is something that could also logically take
490490
// advantage of AVX512, the various downclocking and power
491491
// consumption related woes make it premature to have a dedicated

core/crypto/_chacha20/simd128/chacha20_simd128.odin

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
3939

4040
// Some targets lack runtime feature detection, and will flat out refuse
4141
// to load binaries that have unknown instructions. This is distinct from
42-
// `simd.IS_EMULATED` as actually good designs support runtime feature
42+
// `simd.HAS_HARDWARE_SIMD` as actually good designs support runtime feature
4343
// detection and that constant establishes a baseline.
4444
//
4545
// See:

core/simd/simd.odin

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,20 +21,17 @@ package simd
2121

2222
import "base:builtin"
2323
import "base:intrinsics"
24+
import "base:runtime"
2425

2526
/*
2627
Check if SIMD is supported in hardware on a target platform.
2728
28-
This value is `false`, when the compile-time target has the hardware support for
29-
at 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
30-
for 128-bit SIMD, this value is `true`, and all SIMD operations will likely be
29+
This value is `true`, when the compile-time target has the hardware support for
30+
at least 128-bit SIMD. If the compile-time target lacks the hardware support
31+
for 128-bit SIMD, this value is `false`, and all SIMD operations will likely be
3132
emulated.
3233
*/
33-
IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
34-
true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
35-
true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
36-
true when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
37-
false
34+
HAS_HARDWARE_SIMD :: runtime.HAS_HARDWARE_SIMD
3835

3936
/*
4037
Vector of 16 `u8` lanes (128 bits).

tests/benchmark/bytes/benchmark_bytes.odin

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,15 @@ run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx
5454

5555
accumulator: int
5656

57+
watch: time.Stopwatch
58+
59+
time.stopwatch_start(&watch)
5760
for _ in 0..<runs {
58-
start := time.now()
5961
accumulator += p(data, 'z')
60-
done := time.since(start)
61-
timing += done
6262
}
63+
time.stopwatch_stop(&watch)
6364

64-
timing /= time.Duration(runs)
65+
timing = time.stopwatch_duration(watch)
6566

6667
log.debug(accumulator)
6768
return

0 commit comments

Comments
 (0)