Merge pull request #4063 from Feoramund/simd-memory

Kelimion · web-flow · commit 3142aaf497d7 · 2025-05-29T23:32:19.000+02:00
Vectorize `base:runtime.memory_*`
diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
@@ -16,6 +16,12 @@ RUNTIME_REQUIRE :: false // !ODIN_TILDE
 @(private)
 __float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
 
+HAS_HARDWARE_SIMD :: false when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else // x86 target built without SSE2
+	false when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else // ARM target built without NEON
+	false when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else // WASM target built without simd128
+	false when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else // RISC-V target built without the "v" feature
+	true // every remaining arch/feature combination is treated as having hardware SIMD
+
 
 @(private)
 byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
@@ -229,150 +235,173 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	case n == 0: return true
 	case x == y: return true
 	}
-	a, b := ([^]byte)(x), ([^]byte)(y)
-	length := uint(n)
-
-	for i := uint(0); i < length; i += 1 {
-		if a[i] != b[i] {
-			return false
-		}
-	}
-	return true
-	
-/*
-
-	when size_of(uint) == 8 {
-		if word_length := length >> 3; word_length != 0 {
-			for _ in 0..<word_length {
-				if intrinsics.unaligned_load((^u64)(a)) != intrinsics.unaligned_load((^u64)(b)) {
-					return false
+	a, b := cast([^]byte)x, cast([^]byte)y // byte views of both buffers
+
+	n := uint(n)
+	i := uint(0) // absolute offset of the next byte that has not been compared yet
+	m := uint(0) // absolute end offset of the strided loop currently running
+
+	if n >= 8 {
+		when HAS_HARDWARE_SIMD {
+			// Avoid using 256-bit SIMD on platforms where its emulation is
+			// likely to be less than ideal.
+			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+				m = n / 32 * 32 // i is still 0 here, so this is already an absolute end offset
+				for /**/; i < m; i += 32 {
+					load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+					load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+					ne := intrinsics.simd_lanes_ne(load_a, load_b)
+					if intrinsics.simd_reduce_or(ne) != 0 { // some lane differs
+						return false
+					}
 				}
-				a = a[size_of(u64):]
-				b = b[size_of(u64):]
 			}
 		}
-		
-		if length & 4 != 0 {
-			if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
+
+		m = i + (n-i) / 16 * 16 // offset by i: the loop condition compares absolute offsets
+		for /**/; i < m; i += 16 {
+			load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+			load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+			ne := intrinsics.simd_lanes_ne(load_a, load_b)
+			if intrinsics.simd_reduce_or(ne) != 0 { // some lane differs
 				return false
 			}
-			a = a[size_of(u32):]
-			b = b[size_of(u32):]
 		}
-		
-		if length & 2 != 0 {
-			if intrinsics.unaligned_load((^u16)(a)) != intrinsics.unaligned_load((^u16)(b)) {
+
+		m = i + (n-i) / 8 * 8 // offset by i so this loop still runs after a wider loop advanced i
+		for /**/; i < m; i += 8 {
+			if intrinsics.unaligned_load(cast(^u64)&a[i]) != intrinsics.unaligned_load(cast(^u64)&b[i]) { // u64, not uintptr: the load must cover the full 8-byte stride on 32-bit targets
 				return false
 			}
-			a = a[size_of(u16):]
-			b = b[size_of(u16):]
 		}
-		
-		if length & 1 != 0 && a[0] != b[0] {
-			return false	
-		}
-		return true
-	} else {
-		if word_length := length >> 2; word_length != 0 {
-			for _ in 0..<word_length {
-				if intrinsics.unaligned_load((^u32)(a)) != intrinsics.unaligned_load((^u32)(b)) {
-					return false
-				}
-				a = a[size_of(u32):]
-				b = b[size_of(u32):]
-			}
-		}
-		
-		length &= 3
-		
-		if length != 0 {
-			for i in 0..<length {
-				if a[i] != b[i] {
-					return false
-				}
-			}
-		}
-
-		return true
 	}
-*/
 
+	for /**/; i < n; i += 1 { // byte-wise tail: whatever the strided loops did not cover
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true // no differing byte found in n bytes
 }
-memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+
+memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check { // byte-wise three-way compare of n bytes: returns -1, 0 or +1
 	switch {
-	case a == b:   return 0
-	case a == nil: return -1
-	case b == nil: return +1
-	}
-
-	x := uintptr(a)
-	y := uintptr(b)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
-		if va ~ vb != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				b := (^byte)(y+pos)^
-				if a ~ b != 0 {
-					return -1 if (int(a) - int(b)) < 0 else +1
+	case x == y:   return 0  // same pointer (including both nil)
+	case x == nil: return -1 // a nil x orders before any non-nil y
+	case y == nil: return +1 // a nil y orders before any non-nil x
+	}
+	a, b := cast([^]byte)x, cast([^]byte)y // byte views of both buffers
+	
+	n := uint(n)
+	i := uint(0) // absolute offset of the next byte that has not been compared yet
+	m := uint(0) // absolute end offset of the strided loop currently running
+
+	when HAS_HARDWARE_SIMD {
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			m = n / 32 * 32 // i is still 0 here, so this is already an absolute end offset
+			for /**/; i < m; i += 32 {
+				load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+				load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+				comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+				if intrinsics.simd_reduce_or(comparison) != 0 {
+					sentinel: #simd[32]u8 = u8(0xFF) // larger than any lane index, so equal lanes never win the min
+					indices := intrinsics.simd_indices(#simd[32]u8)
+					index_select := intrinsics.simd_select(comparison, indices, sentinel) // lane index where bytes differ, sentinel elsewhere
+					index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select) // lowest differing lane
+					return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
 				}
 			}
 		}
 	}
 
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		b := (^byte)(y+offset)^
-		if a ~ b != 0 {
-			return -1 if (int(a) - int(b)) < 0 else +1
+	m = i + (n-i) / 16 * 16 // offset by i: the loop condition compares absolute offsets
+	for /**/; i < m; i += 16 {
+		load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+		load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		if intrinsics.simd_reduce_or(comparison) != 0 {
+			sentinel: #simd[16]u8 = u8(0xFF) // larger than any lane index
+			indices := intrinsics.simd_indices(#simd[16]u8)
+			index_select := intrinsics.simd_select(comparison, indices, sentinel) // lane index where bytes differ, sentinel elsewhere
+			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select) // lowest differing lane
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
 		}
 	}
 
+	// 64-bit SIMD is faster than using a `uintptr` to detect a difference then
+	// re-iterating with the byte-by-byte loop, at least on AMD64.
+	m = i + (n-i) / 8 * 8 // offset by i so this loop still runs after a wider loop advanced i
+	for /**/; i < m; i += 8 {
+		load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
+		load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		if intrinsics.simd_reduce_or(comparison) != 0 {
+			sentinel: #simd[8]u8 = u8(0xFF) // larger than any lane index
+			indices := intrinsics.simd_indices(#simd[8]u8)
+			index_select := intrinsics.simd_select(comparison, indices, sentinel) // lane index where bytes differ, sentinel elsewhere
+			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select) // lowest differing lane
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
+		}
+	}
+
+	for /**/; i < n; i += 1 { // byte-wise tail: whatever the strided loops did not cover
+		if a[i] ~ b[i] != 0 {
+			return -1 if int(a[i]) - int(b[i]) < 0 else +1
+		}
+	}
 	return 0
 }
 
 memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
-	x := uintptr(a)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		if va ~ 0 != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				if a ~ 0 != 0 {
-					return -1 if int(a) < 0 else +1
+	n := uint(n)
+	i := uint(0) // absolute offset of the next byte that has not been checked yet
+	m := uint(0) // absolute end offset of the strided loop currently running
+
+	// Because we're comparing against zero, we never return -1, as that would
+	// indicate the compared value is less than zero.
+	//
+	// Note that a zero return value here means equality.
+
+	bytes := ([^]u8)(a)
+
+	if n >= 8 {
+		when HAS_HARDWARE_SIMD {
+			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+				scanner32: #simd[32]u8 // zero-initialized vector to compare against
+				m = n / 32 * 32 // i is still 0 here, so this is already an absolute end offset
+				for /**/; i < m; i += 32 {
+					load := intrinsics.unaligned_load(cast(^#simd[32]u8)&bytes[i])
+					ne := intrinsics.simd_lanes_ne(scanner32, load)
+					if intrinsics.simd_reduce_or(ne) != 0 { // some lane is non-zero
+						return 1
+					}
 				}
 			}
 		}
-	}
 
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		if a ~ 0 != 0 {
-			return -1 if int(a) < 0 else +1
+		scanner16: #simd[16]u8 // zero-initialized vector to compare against
+		m = i + (n-i) / 16 * 16 // offset by i: the loop condition compares absolute offsets
+		for /**/; i < m; i += 16 {
+			load := intrinsics.unaligned_load(cast(^#simd[16]u8)&bytes[i])
+			ne := intrinsics.simd_lanes_ne(scanner16, load)
+			if intrinsics.simd_reduce_or(ne) != 0 { // some lane is non-zero
+				return 1
+			}
+		}
+
+		m = i + (n-i) / 8 * 8 // offset by i so this loop still runs after a wider loop advanced i
+		for /**/; i < m; i += 8 {
+			if intrinsics.unaligned_load(cast(^u64)&bytes[i]) != 0 { // u64, not uintptr: the load must cover the full 8-byte stride on 32-bit targets
+				return 1
+			}
 		}
 	}
 
+	for /**/; i < n; i += 1 { // byte-wise tail: whatever the strided loops did not cover
+		if bytes[i] != 0 {
+			return 1
+		}
+	}
 	return 0
 }
 
diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
@@ -350,7 +350,7 @@ index_byte :: proc "contextless" (s: []byte, c: byte) -> (index: int) #no_bounds
 	}
 
 	c_vec: simd.u8x16 = c
-	when !simd.IS_EMULATED {
+	when simd.HAS_HARDWARE_SIMD {
 		// Note: While this is something that could also logically take
 		// advantage of AVX512, the various downclocking and power
-		// consumption related woes make premature to have a dedicated
+		// consumption related woes make it premature to have a dedicated
@@ -485,7 +485,7 @@ last_index_byte :: proc "contextless" (s: []byte, c: byte) -> int #no_bounds_che
 	}
 
 	c_vec: simd.u8x16 = c
-	when !simd.IS_EMULATED {
+	when simd.HAS_HARDWARE_SIMD {
 		// Note: While this is something that could also logically take
 		// advantage of AVX512, the various downclocking and power
-		// consumption related woes make premature to have a dedicated
+		// consumption related woes make it premature to have a dedicated
diff --git a/core/crypto/_chacha20/simd128/chacha20_simd128.odin b/core/crypto/_chacha20/simd128/chacha20_simd128.odin
@@ -39,7 +39,7 @@ when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
 
 // Some targets lack runtime feature detection, and will flat out refuse
 // to load binaries that have unknown instructions.  This is distinct from
-// `simd.IS_EMULATED` as actually good designs support runtime feature
+// `simd.HAS_HARDWARE_SIMD` as actually good designs support runtime feature
 // detection and that constant establishes a baseline.
 //
 // See:
diff --git a/core/simd/simd.odin b/core/simd/simd.odin
@@ -21,20 +21,17 @@ package simd
 
 import "base:builtin"
 import "base:intrinsics"
+import "base:runtime"
 
 /*
-Check if SIMD is software-emulated on a target platform.
+Check if SIMD is hardware-supported on a target platform.
 
-This value is `false`, when the compile-time target has the hardware support for
-at 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
-for 128-bit SIMD, this value is `true`, and all SIMD operations will likely be
+This value is `true` when the compile-time target has hardware support for
+128-bit (or wider) SIMD. If the compile-time target lacks hardware support
+for 128-bit SIMD, this value is `false`, and all SIMD operations will likely be
 emulated.
 */
-IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
-	true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
-	true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
-	true when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
-	false
+HAS_HARDWARE_SIMD :: runtime.HAS_HARDWARE_SIMD // alias of the constant defined in `base:runtime`
 
 /*
 Vector of 16 `u8` lanes (128 bits).
diff --git a/tests/benchmark/bytes/benchmark_bytes.odin b/tests/benchmark/bytes/benchmark_bytes.odin
@@ -54,14 +54,15 @@ run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx
 
 	accumulator: int
 
+	watch: time.Stopwatch
+
+	time.stopwatch_start(&watch)
 	for _ in 0..<runs {
-		start := time.now()
 		accumulator += p(data, 'z')
-		done := time.since(start)
-		timing += done
 	}
+	time.stopwatch_stop(&watch)
 
-	timing /= time.Duration(runs)
+	timing = time.stopwatch_duration(watch)
 
 	log.debug(accumulator)
 	return
diff --git a/tests/benchmark/runtime/benchmark_runtime.odin b/tests/benchmark/runtime/benchmark_runtime.odin
diff --git a/tests/core/runtime/test_core_runtime.odin b/tests/core/runtime/test_core_runtime.odin