From b5bc0fdcda8285656562d701fae0e4d6f92ae183 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Tue, 27 May 2025 15:25:30 -0400
Subject: [PATCH 1/7] Remove commented block of code

---
 base/runtime/internal.odin | 59 --------------------------------------
 1 file changed, 59 deletions(-)

diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index 38b7f662cad..82ae6ffa412 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -238,65 +238,6 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 		}
 	}
 	return true
-
-/*
-
-	when size_of(uint) == 8 {
-		if word_length := length >> 3; word_length != 0 {
-			for _ in 0..<word_length {
-				...
-			}
-		}
-		...
-	}
-
-	if word_length := length >> 2; word_length != 0 {
-		for _ in 0..<word_length {
-			...
-		}
-	}
-	...
-*/
 }
 
 memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
 	switch {

From 827a6f90454cc7540bb3a809657b8d4162545f3c Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Thu, 29 May 2025 15:11:12 -0400
Subject: [PATCH 2/7] Move `simd.IS_EMULATED` to `runtime.SIMD_IS_EMULATED`

---
 base/runtime/internal.odin | 5 +++++
 core/simd/simd.odin        | 7 ++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index 82ae6ffa412..bddbcaa228f 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -16,6 +16,11 @@ RUNTIME_REQUIRE :: false // !ODIN_TILDE
 @(private)
 __float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
 
+SIMD_IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
+	true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
+	true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
+	true when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
+	false
 
 @(private)
 byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index a97155f58e3..c6c1e10a0e8 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -21,6 +21,7 @@ package simd
 
 import "base:builtin"
 import "base:intrinsics"
+import "base:runtime"
 
 /*
 Check if SIMD is software-emulated on a target platform.
@@ -30,11 +31,7 @@ at 128-bit (or wider) SIMD. If the compile-time target lacks the hardware suppor
 for 128-bit SIMD, this value is `true`, and all SIMD operations will likely be
 emulated.
 */
-IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
-	true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
-	true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
-	true when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
-	false
+IS_EMULATED :: runtime.SIMD_IS_EMULATED
 
 /*
 Vector of 16 `u8` lanes (128 bits).
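Aside, for reviewers: the constant moved above is meant to be consumed as a
compile-time gate. A minimal sketch of that pattern follows; it is not part of
the patches, `max_u8` is a hypothetical helper, and after PATCH 7/7 the
condition would read `when simd.HAS_HARDWARE_SIMD` rather than
`when !simd.IS_EMULATED`.

	package example

	import "core:simd"

	// Hypothetical helper: largest byte in a slice. The 128-bit-wide loop
	// is only compiled in when the target has real hardware SIMD; emulated
	// targets fall through to the plain scalar loop.
	max_u8 :: proc "contextless" (s: []u8) -> (max_val: u8) {
		i := 0
		when !simd.IS_EMULATED {
			acc: simd.u8x16
			for /**/; i + 16 <= len(s); i += 16 {
				v := simd.from_slice(simd.u8x16, s[i:i+16])
				acc = simd.max(acc, v)
			}
			max_val = simd.reduce_max(acc)
		}
		for /**/; i < len(s); i += 1 {
			max_val = max(max_val, s[i])
		}
		return
	}

Keeping the definition in `base:runtime` while re-exporting it from
`core:simd` lets the runtime's own `memory_*` procedures use the same gate
without importing `core:simd`, which the next patch relies on.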
From 34698288b812147202cd30cc357b47f306cc8f41 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Thu, 29 May 2025 15:32:58 -0400
Subject: [PATCH 3/7] Vectorize `runtime.memory_*` comparison procedures

---
 base/runtime/internal.odin | 198 ++++++++++++++++++++++++++-----------
 1 file changed, 140 insertions(+), 58 deletions(-)

diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index bddbcaa228f..f51d01a9d83 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -234,91 +234,173 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	case n == 0: return true
 	case x == y: return true
 	}
-	a, b := ([^]byte)(x), ([^]byte)(y)
-	length := uint(n)
+	a, b := cast([^]byte)x, cast([^]byte)y
+
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	if n >= 8 {
+		when !SIMD_IS_EMULATED {
+			// Avoid using 256-bit SIMD on platforms where its emulation is
+			// likely to be less than ideal.
+			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+				m = n / 32 * 32
+				for /**/; i < m; i += 32 {
+					load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+					load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+					ne := intrinsics.simd_lanes_ne(load_a, load_b)
+					if intrinsics.simd_reduce_or(ne) != 0 {
+						return false
+					}
+				}
+			}
+		}
+
+		m = (n-i) / 16 * 16
+		for /**/; i < m; i += 16 {
+			load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+			load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+			ne := intrinsics.simd_lanes_ne(load_a, load_b)
+			if intrinsics.simd_reduce_or(ne) != 0 {
+				return false
+			}
+		}
 
-	for i := uint(0); i < length; i += 1 {
+		m = (n-i) / 8 * 8
+		for /**/; i < m; i += 8 {
+			if intrinsics.unaligned_load(cast(^uintptr)&a[i]) != intrinsics.unaligned_load(cast(^uintptr)&b[i]) {
+				return false
+			}
+		}
+	}
+
+	for /**/; i < n; i += 1 {
 		if a[i] != b[i] {
 			return false
 		}
 	}
 	return true
 }
-
-memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+
+memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_check {
 	switch {
-	case a == b: return 0
-	case a == nil: return -1
-	case b == nil: return +1
-	}
-
-	x := uintptr(a)
-	y := uintptr(b)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
-		if va ~ vb != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				b := (^byte)(y+pos)^
-				if a ~ b != 0 {
-					return -1 if (int(a) - int(b)) < 0 else +1
+	case x == y: return 0
+	case x == nil: return -1
+	case y == nil: return +1
+	}
+	a, b := cast([^]byte)x, cast([^]byte)y
+
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	when !SIMD_IS_EMULATED {
+		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+			m = n / 32 * 32
+			for /**/; i < m; i += 32 {
+				load_a := intrinsics.unaligned_load(cast(^#simd[32]u8)&a[i])
+				load_b := intrinsics.unaligned_load(cast(^#simd[32]u8)&b[i])
+				comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+				if intrinsics.simd_reduce_or(comparison) != 0 {
+					sentinel: #simd[32]u8 = u8(0xFF)
+					indices := intrinsics.simd_indices(#simd[32]u8)
+					index_select := intrinsics.simd_select(comparison, indices, sentinel)
+					index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+					return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
+				}
+			}
+		}
+	}
 
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		b := (^byte)(y+offset)^
-		if a ~ b != 0 {
-			return -1 if (int(a) - int(b)) < 0 else +1
+	m = (n-i) / 16 * 16
+	for /**/; i < m; i += 16 {
+		load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)&a[i])
+		load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)&b[i])
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		if intrinsics.simd_reduce_or(comparison) != 0 {
+			sentinel: #simd[16]u8 = u8(0xFF)
+			indices := intrinsics.simd_indices(#simd[16]u8)
+			index_select := intrinsics.simd_select(comparison, indices, sentinel)
+			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
 		}
 	}
+	// 64-bit SIMD is faster than using a `uintptr` to detect a difference then
+	// re-iterating with the byte-by-byte loop, at least on AMD64.
+	m = (n-i) / 8 * 8
+	for /**/; i < m; i += 8 {
+		load_a := intrinsics.unaligned_load(cast(^#simd[8]u8)&a[i])
+		load_b := intrinsics.unaligned_load(cast(^#simd[8]u8)&b[i])
+		comparison := intrinsics.simd_lanes_ne(load_a, load_b)
+		if intrinsics.simd_reduce_or(comparison) != 0 {
+			sentinel: #simd[8]u8 = u8(0xFF)
+			indices := intrinsics.simd_indices(#simd[8]u8)
+			index_select := intrinsics.simd_select(comparison, indices, sentinel)
+			index_reduce := cast(uint)intrinsics.simd_reduce_min(index_select)
+			return -1 if a[i+index_reduce] < b[i+index_reduce] else +1
+		}
+	}
+
+	for /**/; i < n; i += 1 {
+		if a[i] ~ b[i] != 0 {
+			return -1 if int(a[i]) - int(b[i]) < 0 else +1
+		}
+	}
 
 	return 0
 }
 
 memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
-	x := uintptr(a)
-	n := uintptr(n)
-
-	SU :: size_of(uintptr)
-	fast := n/SU + 1
-	offset := (fast-1)*SU
-	curr_block := uintptr(0)
-	if n < SU {
-		fast = 0
-	}
-
-	for /**/; curr_block < fast; curr_block += 1 {
-		va := (^uintptr)(x + curr_block * size_of(uintptr))^
-		if va ~ 0 != 0 {
-			for pos := curr_block*SU; pos < n; pos += 1 {
-				a := (^byte)(x+pos)^
-				if a ~ 0 != 0 {
-					return -1 if int(a) < 0 else +1
+	n := uint(n)
+	i := uint(0)
+	m := uint(0)
+
+	// Because we're comparing against zero, we never return -1, as that would
+	// indicate the compared value is less than zero.
+	//
+	// Note that a zero return value here means equality.
+
+	bytes := ([^]u8)(a)
+
+	if n >= 8 {
+		when !SIMD_IS_EMULATED {
+			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
+				scanner32: #simd[32]u8
+				m = n / 32 * 32
+				for /**/; i < m; i += 32 {
+					load := intrinsics.unaligned_load(cast(^#simd[32]u8)&bytes[i])
+					ne := intrinsics.simd_lanes_ne(scanner32, load)
+					if intrinsics.simd_reduce_or(ne) > 0 {
+						return 1
+					}
+				}
+			}
+		}
-	}
 
-	for /**/; offset < n; offset += 1 {
-		a := (^byte)(x+offset)^
-		if a ~ 0 != 0 {
-			return -1 if int(a) < 0 else +1
+		scanner16: #simd[16]u8
+		m = (n-i) / 16 * 16
+		for /**/; i < m; i += 16 {
+			load := intrinsics.unaligned_load(cast(^#simd[16]u8)&bytes[i])
+			ne := intrinsics.simd_lanes_ne(scanner16, load)
+			if intrinsics.simd_reduce_or(ne) != 0 {
+				return 1
+			}
+		}
+
+		m = (n-i) / 8 * 8
+		for /**/; i < m; i += 8 {
+			if intrinsics.unaligned_load(cast(^uintptr)&bytes[i]) != 0 {
+				return 1
+			}
 		}
 	}
 
+	for /**/; i < n; i += 1 {
+		if bytes[i] != 0 {
+			return 1
+		}
+	}
 	return 0
 }

From b15a665898e72433d8b1486819cd57cc2a9b5f71 Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Thu, 29 May 2025 16:04:56 -0400
Subject: [PATCH 4/7] Add tests for `runtime.memory_*` comparison procedures

---
 tests/core/runtime/test_core_runtime.odin | 76 +++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/tests/core/runtime/test_core_runtime.odin b/tests/core/runtime/test_core_runtime.odin
index 472a5527d8d..6bbb9fb8a1a 100644
--- a/tests/core/runtime/test_core_runtime.odin
+++ b/tests/core/runtime/test_core_runtime.odin
@@ -4,6 +4,7 @@ package test_core_runtime
 import "base:intrinsics"
 import "core:mem"
 import "base:runtime"
+import "core:slice"
 import "core:testing"
 
 // Tests that having space for the allocation, but not for the allocation and alignment
@@ -177,3 +178,78 @@ test_map_get :: proc(t: ^testing.T) {
 		check(t, m)
 	}
 }
+
+@(test)
+test_memory_equal :: proc(t: ^testing.T) {
+	data: [256]u8
+	cmp: [256]u8
+
+	slice.fill(data[:], 0xAA)
+	slice.fill(cmp[:], 0xAA)
+
+	for offset in 0..<len(data) {
+		...

From ... Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Thu, 29 May 2025 16:05:29 -0400
Subject: [PATCH 5/7] Add benchmarks for `runtime.memory_*` comparison
 procedures

---
 .../benchmark/runtime/benchmark_runtime.odin | 227 ++++++++++++++++++
 1 file changed, 227 insertions(+)
 create mode 100644 tests/benchmark/runtime/benchmark_runtime.odin

diff --git a/tests/benchmark/runtime/benchmark_runtime.odin b/tests/benchmark/runtime/benchmark_runtime.odin
new file mode 100644
index 00000000000..871fb05e617
--- /dev/null
+++ b/tests/benchmark/runtime/benchmark_runtime.odin
@@ -0,0 +1,227 @@
+package benchmark_runtime
+
+import "base:runtime"
+import "core:fmt"
+import "core:log"
+import "core:testing"
+import "core:strings"
+import "core:text/table"
+import "core:time"
+
+RUNS_PER_SIZE :: 2500
+
+sizes := [?]int {
+	7, 8, 9,
+	15, 16, 17,
+	31, 32, 33,
+	63, 64, 65,
+	95, 96, 97,
+	128,
+	256,
+	512,
+	1024,
+	4096,
+	1024 * 1024,
+}
+
+// These are the normal, unoptimized algorithms.
+
+plain_memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
+	switch {
+	case n == 0: return true
+	case x == y: return true
+	}
+	a, b := ([^]byte)(x), ([^]byte)(y)
+	length := uint(n)
+
+	for i := uint(0); i < length; i += 1 {
+		if a[i] != b[i] {
+			return false
+		}
+	}
+	return true
+}
+
+plain_memory_compare :: proc "contextless" (a, b: rawptr, n: int) -> int #no_bounds_check {
+	switch {
+	case a == b: return 0
+	case a == nil: return -1
+	case b == nil: return +1
+	}
+
+	x := uintptr(a)
+	y := uintptr(b)
+	n := uintptr(n)
+
+	SU :: size_of(uintptr)
+	fast := n/SU + 1
+	offset := (fast-1)*SU
+	curr_block := uintptr(0)
+	if n < SU {
+		fast = 0
+	}
+
+	for /**/; curr_block < fast; curr_block += 1 {
+		va := (^uintptr)(x + curr_block * size_of(uintptr))^
+		vb := (^uintptr)(y + curr_block * size_of(uintptr))^
+		if va ~ vb != 0 {
+			for pos := curr_block*SU; pos < n; pos += 1 {
+				a := (^byte)(x+pos)^
+				b := (^byte)(y+pos)^
+				if a ~ b != 0 {
+					return -1 if (int(a) - int(b)) < 0 else +1
+				}
+			}
+		}
+	}
+
+	for /**/; offset < n; offset += 1 {
+		a := (^byte)(x+offset)^
+		b := (^byte)(y+offset)^
+		if a ~ b != 0 {
+			return -1 if (int(a) - int(b)) < 0 else +1
+		}
+	}
+
+	return 0
+}
+
+plain_memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_check {
+	x := uintptr(a)
+	n := uintptr(n)
+
+	SU :: size_of(uintptr)
+	fast := n/SU + 1
+	offset := (fast-1)*SU
+	curr_block := uintptr(0)
+	if n < SU {
+		fast = 0
+	}
+
+	for /**/; curr_block < fast; curr_block += 1 {
+		va := (^uintptr)(x + curr_block * size_of(uintptr))^
+		if va ~ 0 != 0 {
+			for pos := curr_block*SU; pos < n; pos += 1 {
+				a := (^byte)(x+pos)^
+				if a ~ 0 != 0 {
+					return -1 if int(a) < 0 else +1
+				}
+			}
+		}
+	}
+
+	for /**/; offset < n; offset += 1 {
+		a := (^byte)(x+offset)^
+		if a ~ 0 != 0 {
+			return -1 if int(a) < 0 else +1
+		}
+	}
+
+	return 0
+}
+
+run_trial_size_cmp :: proc(p: proc "contextless" (rawptr, rawptr, int) -> $R, size: int, idx: int, runs: int, loc := #caller_location) -> (timing: time.Duration) {
+	left := make([]u8, size)
+	right := make([]u8, size)
+	defer {
+		delete(left)
+		delete(right)
+	}
+
+	right[idx] = 0x01
+
+	accumulator: int
+
+	watch: time.Stopwatch
+
+	time.stopwatch_start(&watch)
+	for _ in 0..<runs {
+		...
+	}
+	...
+}
+
+run_trial_size_zero :: proc(p: proc "contextless" (rawptr, int) -> int, size: int, idx: int, runs: int, loc := #caller_location) -> (timing: time.Duration) {
+	data := make([]u8, size)
+	defer delete(data)
+
+	data[idx] = 0x01
+
+	accumulator: int
+
+	watch: time.Stopwatch
+
+	time.stopwatch_start(&watch)
+	for _ in 0..<runs {
+		...

From ... Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Thu, 29 May 2025 16:08:39 -0400
Subject: [PATCH 6/7] Use `time.Stopwatch` in `core:bytes` benchmark

This should result in a clearer idea of how fast the procedures are
running, as the loop can run without going back and forth to the system
for the time.
---
 tests/benchmark/bytes/benchmark_bytes.odin | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/benchmark/bytes/benchmark_bytes.odin b/tests/benchmark/bytes/benchmark_bytes.odin
index 13ef8f9a53a..ee3a91d6405 100644
--- a/tests/benchmark/bytes/benchmark_bytes.odin
+++ b/tests/benchmark/bytes/benchmark_bytes.odin
@@ -54,14 +54,15 @@ run_trial_size :: proc(p: proc "contextless" ([]u8, byte) -> int, size: int, idx
 
 	accumulator: int
 
+	watch: time.Stopwatch
+
+	time.stopwatch_start(&watch)
 	for _ in 0..<runs {
 		...

From ... Mon Sep 17 00:00:00 2001
From: Feoramund <161657516+Feoramund@users.noreply.github.com>
Date: Thu, 29 May 2025 17:17:51 -0400
Subject: [PATCH 7/7] Rename `SIMD_IS_EMULATED` to capability-affirmative
 `HAS_HARDWARE_SIMD`

---
 base/runtime/internal.odin                  | 17 +++++++++--------
 core/bytes/bytes.odin                       |  4 ++--
 .../_chacha20/simd128/chacha20_simd128.odin |  2 +-
 core/simd/simd.odin                         |  8 ++++----
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/base/runtime/internal.odin b/base/runtime/internal.odin
index f51d01a9d83..a35dbff8a4f 100644
--- a/base/runtime/internal.odin
+++ b/base/runtime/internal.odin
@@ -16,11 +16,12 @@ RUNTIME_REQUIRE :: false // !ODIN_TILDE
 @(private)
 __float16 :: f16 when __ODIN_LLVM_F16_SUPPORTED else u16
 
-SIMD_IS_EMULATED :: true when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
-	true when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
-	true when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
-	true when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
-	false
+HAS_HARDWARE_SIMD :: false when (ODIN_ARCH == .amd64 || ODIN_ARCH == .i386) && !intrinsics.has_target_feature("sse2") else
+	false when (ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32) && !intrinsics.has_target_feature("neon") else
+	false when (ODIN_ARCH == .wasm64p32 || ODIN_ARCH == .wasm32) && !intrinsics.has_target_feature("simd128") else
+	false when (ODIN_ARCH == .riscv64) && !intrinsics.has_target_feature("v") else
+	true
+
 
 @(private)
 byte_slice :: #force_inline proc "contextless" (data: rawptr, len: int) -> []byte #no_bounds_check {
@@ -241,7 +242,7 @@ memory_equal :: proc "contextless" (x, y: rawptr, n: int) -> bool {
 	m := uint(0)
 
 	if n >= 8 {
-		when !SIMD_IS_EMULATED {
+		when HAS_HARDWARE_SIMD {
 			// Avoid using 256-bit SIMD on platforms where its emulation is
 			// likely to be less than ideal.
 			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
@@ -295,7 +296,7 @@ memory_compare :: proc "contextless" (x, y: rawptr, n: int) -> int #no_bounds_ch
 	i := uint(0)
 	m := uint(0)
 
-	when !SIMD_IS_EMULATED {
+	when HAS_HARDWARE_SIMD {
 		when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
 			m = n / 32 * 32
 			for /**/; i < m; i += 32 {
@@ -364,7 +365,7 @@ memory_compare_zero :: proc "contextless" (a: rawptr, n: int) -> int #no_bounds_
 	bytes := ([^]u8)(a)
 
 	if n >= 8 {
-		when !SIMD_IS_EMULATED {
+		when HAS_HARDWARE_SIMD {
 			when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") {
 				scanner32: #simd[32]u8
 				m = n / 32 * 32
diff --git a/core/bytes/bytes.odin b/core/bytes/bytes.odin
index c0d25bcce42..71b6ef70c41 100644
--- a/core/bytes/bytes.odin
+++ b/core/bytes/bytes.odin
@@ -350,7 +350,7 @@ index_byte :: proc "contextless" (s: []byte, c: byte) -> (index: int) #no_bounds
 	}
 
 	c_vec: simd.u8x16 = c
-	when !simd.IS_EMULATED {
+	when simd.HAS_HARDWARE_SIMD {
 		// Note: While this is something that could also logically take
 		// advantage of AVX512, the various downclocking and power
 		// consumption related woes make premature to have a dedicated
@@ -485,7 +485,7 @@ last_index_byte :: proc "contextless" (s: []byte, c: byte) -> int #no_bounds_che
 	}
 
 	c_vec: simd.u8x16 = c
-	when !simd.IS_EMULATED {
+	when simd.HAS_HARDWARE_SIMD {
 		// Note: While this is something that could also logically take
 		// advantage of AVX512, the various downclocking and power
 		// consumption related woes make premature to have a dedicated
diff --git a/core/crypto/_chacha20/simd128/chacha20_simd128.odin b/core/crypto/_chacha20/simd128/chacha20_simd128.odin
index 6b37b8d6141..4bf40e240b6 100644
--- a/core/crypto/_chacha20/simd128/chacha20_simd128.odin
+++ b/core/crypto/_chacha20/simd128/chacha20_simd128.odin
@@ -39,7 +39,7 @@ when ODIN_ARCH == .arm64 || ODIN_ARCH == .arm32 {
 
 // Some targets lack runtime feature detection, and will flat out refuse
 // to load binaries that have unknown instructions. This is distinct from
-// `simd.IS_EMULATED` as actually good designs support runtime feature
+// `simd.HAS_HARDWARE_SIMD` as actually good designs support runtime feature
 // detection and that constant establishes a baseline.
 //
 // See:
diff --git a/core/simd/simd.odin b/core/simd/simd.odin
index c6c1e10a0e8..b4779b5ff6b 100644
--- a/core/simd/simd.odin
+++ b/core/simd/simd.odin
@@ -26,12 +26,12 @@ import "base:runtime"
 /*
 Check if SIMD is software-emulated on a target platform.
 
-This value is `false`, when the compile-time target has the hardware support for
-at 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
-for 128-bit SIMD, this value is `true`, and all SIMD operations will likely be
+This value is `true`, when the compile-time target has the hardware support for
+at least 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
+for 128-bit SIMD, this value is `false`, and all SIMD operations will likely be
 emulated.
 */
-IS_EMULATED :: runtime.SIMD_IS_EMULATED
+HAS_HARDWARE_SIMD :: runtime.HAS_HARDWARE_SIMD
 
 /*
 Vector of 16 `u8` lanes (128 bits).
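Aside, for reviewers: the sentinel/index-reduction trick introduced in
`memory_compare` by PATCH 3/7 is easier to read outside of a diff. The
following standalone sketch (hypothetical `first_diff_16`; not part of the
series) applies the same technique to a single 16-byte block:

	package example

	import "base:intrinsics"

	// Index of the first differing byte between two 16-byte blocks,
	// or -1 when they are equal.
	first_diff_16 :: proc "contextless" (a, b: ^[16]u8) -> int {
		load_a := intrinsics.unaligned_load(cast(^#simd[16]u8)a)
		load_b := intrinsics.unaligned_load(cast(^#simd[16]u8)b)
		// Lanes are all-ones wherever the two blocks differ.
		ne := intrinsics.simd_lanes_ne(load_a, load_b)
		if intrinsics.simd_reduce_or(ne) == 0 {
			return -1
		}
		// Differing lanes keep their own index; equal lanes receive the
		// out-of-range sentinel 0xFF. The minimum over all lanes is then
		// the index of the first mismatch.
		sentinel: #simd[16]u8 = u8(0xFF)
		indices := intrinsics.simd_indices(#simd[16]u8)
		selected := intrinsics.simd_select(ne, indices, sentinel)
		return int(intrinsics.simd_reduce_min(selected))
	}

This is also why the byte-by-byte tails remain in all three procedures: the
vector loops only cover whole multiples of the lane width, and the final few
bytes are always compared scalar-wise.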