@@ -16,6 +16,12 @@ RUNTIME_REQUIRE :: false // !ODIN_TILDE
 @(private)
 __float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
 
+HAS_HARDWARE_SIMD :: false when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
+	false when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
+	false when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
+	false when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
+	true
+
 
 @(private)
 byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
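The `HAS_HARDWARE_SIMD` constant resolves entirely at compile time: each `when` arm yields `false` when the architecture's baseline vector feature is absent, so the SIMD paths below disappear from builds that would otherwise emulate them in software. A minimal sketch of the same gating pattern, outside this change and with an illustrative procedure name:

    package example

    import "base:intrinsics"

    // Illustrative only: `when` discards the untaken branch before code
    // generation, so no runtime feature check survives into the binary.
    wide_or_scalar :: proc "contextless" () -> string {
    	when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
    		return "256-bit path"
    	} else {
    		return "scalar path"
    	}
    }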
@@ -229,150 +235,173 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	case n == 0: return true
 	case x == y: return true
 	}
-	a, b := ([^]byte)(x), ([^]byte)(y)
-	length := uint(n)
-
-	for i := uint(0); i < length; i += 1 {
-		if a[i] != b[i] {
-			return false
-		}
-	}
-	return true
-
-	/*
-
-	when size_of(uint) == 8 {
-		if word_length := length >> 3; word_length != 0 {
-			for _ in 0..<word_length {
-				if intrinsics.unaligned_load((^u64)(a)) != intrinsics.unaligned_load((^u64)(b)) {
-					return false
+	a, b := cast([^]byte)x, cast([^]byte)y
+
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	if n >= 8 {
+		when HAS_HARDWARE_SIMD {
+			// Avoid using 256-bit SIMD on platforms where its emulation is
+			// likely to be less than ideal.
+			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+				m = n / 32 * 32
+				for /**/ ; i < m; i += 32 {
+					load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+					load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+					ne := intrinsics.simd_lanes_ne(load_a, load_b)
+					if intrinsics.simd_reduce_or(ne) != 0 {
+						return false
+					}
 				}
-				a = a[size_of(u64):]
-				b = b[size_of(u64):]
 			}
 		}
-
-		if length & 4 != 0 {
-			if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
+
+		m = (n-i) / 16 * 16
+		for /**/ ; i < m; i += 16 {
+			load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+			load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+			ne := intrinsics.simd_lanes_ne(load_a, load_b)
+			if intrinsics.simd_reduce_or(ne) != 0 {
 				return false
 			}
-			a = a[size_of(u32):]
-			b = b[size_of(u32):]
 		}
-
-		if length & 2 != 0 {
-			if intrinsics.unaligned_load((^u16)(a)) != intrinsics.unaligned_load((^u16)(b)) {
+
+		m = (n-i) / 8 * 8
+		for /**/ ; i < m; i += 8 {
+			if intrinsics.unaligned_load(cast(^uintptr)&a[i]) != intrinsics.unaligned_load(cast(^uintptr)&b[i]) {
 				return false
 			}
-			a = a[size_of(u16):]
-			b = b[size_of(u16):]
 		}
-
-		if length & 1 != 0 && a[0] != b[0] {
-			return false
-		}
-		return true
-	} else {
-		if word_length := length >> 2; word_length != 0 {
-			for _ in 0..<word_length {
-				if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
-					return false
-				}
-				a = a[size_of(u32):]
-				b = b[size_of(u32):]
-			}
-		}
-
-		length &= 3
-
-		if length != 0 {
-			for i in 0..<length {
-				if a[i] != b[i] {
-					return false
-				}
-			}
-		}
-
-		return true
 	}
-	*/
 
+	for /**/ ; i < n; i += 1 {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
 }
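Behaviorally, the rewritten `memory_equal` is unchanged: it steps down from 256-bit to 128-bit to word-sized loads and finishes byte-by-byte, returning on the first mismatch. A quick sanity check, assuming `memory_equal` remains exported from `base:runtime` (the test values are illustrative):

    package example

    import "base:runtime"

    main :: proc() {
    	a := [9]u8{1, 2, 3, 4, 5, 6, 7, 8, 9}
    	b := a
    	c := a
    	c[8] = 0
    	assert(runtime.memory_equal(&a, &b, len(a)))  // identical contents
    	assert(!runtime.memory_equal(&a, &c, len(a))) // differ in the last byte
    }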
-memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+
+memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check {
 	switch {
-	case a == b: return 0
-	case a == nil: return -1
-	case b == nil: return +1
-	}
-
-	x := uintptr(a)
-	y := uintptr(b)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/ ; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
-		if va ~ vb != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				b := (^byte)(y+pos)^
-				if a ~ b != 0 {
-					return -1 if (int(a) - int(b)) < 0 else +1
+	case x == y: return 0
+	case x == nil: return -1
+	case y == nil: return +1
+	}
+	a, b := cast([^]byte)x, cast([^]byte)y
+
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	when HAS_HARDWARE_SIMD {
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			m = n / 32 * 32
+			for /**/ ; i < m; i += 32 {
+				load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+				load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+				comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+				if intrinsics.simd_reduce_or(comparison) != 0 {
+					sentinel: #simd[32]u8 = u8(0xFF)
+					indices := intrinsics.simd_indices(#simd[32]u8)
+					index_select := intrinsics.simd_select(comparison, indices, sentinel)
+					index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+					return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
 				}
 			}
 		}
 	}
 
-	for /**/ ; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		b := (^byte)(y+offset)^
-		if a ~ b != 0 {
-			return -1 if (int(a) - int(b)) < 0 else +1
+	m = (n-i) / 16 * 16
+	for /**/ ; i < m; i += 16 {
+		load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+		load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		if intrinsics.simd_reduce_or(comparison) != 0 {
+			sentinel: #simd[16]u8 = u8(0xFF)
+			indices := intrinsics.simd_indices(#simd[16]u8)
+			index_select := intrinsics.simd_select(comparison, indices, sentinel)
+			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
 		}
 	}
 
+	// 64-bit SIMD is faster than using a `uintptr` to detect a difference then
+	// re-iterating with the byte-by-byte loop, at least on AMD64.
+	m = (n-i) / 8 * 8
+	for /**/ ; i < m; i += 8 {
+		load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
+		load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		if intrinsics.simd_reduce_or(comparison) != 0 {
+			sentinel: #simd[8]u8 = u8(0xFF)
+			indices := intrinsics.simd_indices(#simd[8]u8)
+			index_select := intrinsics.simd_select(comparison, indices, sentinel)
+			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
+		}
+	}
+
+	for /**/ ; i < n; i += 1 {
+		if a[i] ~ b[i] != 0 {
+			return -1 if int(a[i]) - int(b[i]) < 0 else +1
+		}
+	}
 	return 0
 }
 
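The sentinel trick in `memory_compare` deserves a note: lanes that compare equal are replaced with `0xFF`, which can never be a real lane index (there are at most 32 lanes), so `simd_reduce_min` over the selected vector yields exactly the position of the first differing byte. A scalar model of that reduction, purely for illustration:

    // Scalar model of the SIMD first-difference search above: equal lanes
    // map to the 0xFF sentinel, so the minimum of the selected values is the
    // index of the first mismatching byte (or 0xFF if none differ).
    first_diff_index :: proc "contextless" (a, b: [16]u8) -> uint {
    	index := uint(0xFF) // sentinel: no difference found
    	for i := uint(0); i < 16; i += 1 {
    		if a[i] != b[i] {
    			index = min(index, i) // plays the role of simd_reduce_min
    		}
    	}
    	return index
    }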
 memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
-	x := uintptr(a)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/ ; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		if va ~ 0 != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				if a ~ 0 != 0 {
-					return -1 if int(a) < 0 else +1
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	// Because we're comparing against zero, we never return -1, as that would
+	// indicate the compared value is less than zero.
+	//
+	// Note that a zero return value here means equality.
+
+	bytes := ([^]u8)(a)
+
+	if n >= 8 {
+		when HAS_HARDWARE_SIMD {
+			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+				scanner32: #simd[32]u8
+				m = n / 32 * 32
+				for /**/ ; i < m; i += 32 {
+					load := intrinsics.unaligned_load(cast(^#simd[32]u8)&bytes[i])
+					ne := intrinsics.simd_lanes_ne(scanner32, load)
+					if intrinsics.simd_reduce_or(ne) > 0 {
+						return 1
+					}
 				}
 			}
 		}
-	}
 
-	for /**/ ; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		if a ~ 0 != 0 {
-			return -1 if int(a) < 0 else +1
+		scanner16: #simd[16]u8
+		m = (n-i) / 16 * 16
+		for /**/ ; i < m; i += 16 {
+			load := intrinsics.unaligned_load(cast(^#simd[16]u8)&bytes[i])
+			ne := intrinsics.simd_lanes_ne(scanner16, load)
+			if intrinsics.simd_reduce_or(ne) != 0 {
+				return 1
+			}
+		}
+
+		m = (n-i) / 8 * 8
+		for /**/ ; i < m; i += 8 {
+			if intrinsics.unaligned_load(cast(^uintptr)&bytes[i]) != 0 {
+				return 1
+			}
 		}
 	}
 
+	for /**/ ; i < n; i += 1 {
+		if bytes[i] != 0 {
+			return 1
+		}
+	}
 	return 0
 }
 
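Unlike `memory_compare`, `memory_compare_zero` never needs a lane index: any nonzero byte makes the region compare greater than zero, so each loop can return `1` the moment a mismatching chunk is seen. A small usage sketch, assuming the procedure stays exported from `base:runtime`:

    package example

    import "base:runtime"

    main :: proc() {
    	buf: [64]u8
    	assert(runtime.memory_compare_zero(&buf, len(buf)) == 0) // all zero
    	buf[63] = 1
    	assert(runtime.memory_compare_zero(&buf, len(buf)) == 1) // nonzero tail
    }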