diff --git a/base/runtime/random_generator.odin b/base/runtime/random_generator.odin index ca5c008d0f0..7d873fe335c 100644 --- a/base/runtime/random_generator.odin +++ b/base/runtime/random_generator.odin @@ -41,88 +41,3 @@ random_generator_reset_u64 :: proc(rg: Random_Generator, p: u64) { rg.procedure(rg.data, .Reset, ([^]byte)(&p)[:size_of(p)]) } } - - -Default_Random_State :: struct { - state: u64, - inc: u64, -} - -default_random_generator_proc :: proc(data: rawptr, mode: Random_Generator_Mode, p: []byte) { - @(require_results) - read_u64 :: proc "contextless" (r: ^Default_Random_State) -> u64 { - old_state := r.state - r.state = old_state * 6364136223846793005 + (r.inc|1) - xor_shifted := (((old_state >> 59) + 5) ~ old_state) * 12605985483714917081 - rot := (old_state >> 59) - return (xor_shifted >> rot) | (xor_shifted << ((-rot) & 63)) - } - - @(thread_local) - global_rand_seed: Default_Random_State - - init :: proc "contextless" (r: ^Default_Random_State, seed: u64) { - seed := seed - if seed == 0 { - seed = u64(intrinsics.read_cycle_counter()) - } - r.state = 0 - r.inc = (seed << 1) | 1 - _ = read_u64(r) - r.state += seed - _ = read_u64(r) - } - - r: ^Default_Random_State = --- - if data == nil { - r = &global_rand_seed - } else { - r = cast(^Default_Random_State)data - } - - switch mode { - case .Read: - if r.state == 0 && r.inc == 0 { - init(r, 0) - } - - switch len(p) { - case size_of(u64): - // Fast path for a 64-bit destination. - intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r)) - case: - // All other cases. - pos := i8(0) - val := u64(0) - for &v in p { - if pos == 0 { - val = read_u64(r) - pos = 8 - } - v = byte(val) - val >>= 8 - pos -= 1 - } - } - - case .Reset: - seed: u64 - mem_copy_non_overlapping(&seed, raw_data(p), min(size_of(seed), len(p))) - init(r, seed) - - case .Query_Info: - if len(p) != size_of(Random_Generator_Query_Info) { - return - } - info := (^Random_Generator_Query_Info)(raw_data(p)) - info^ += {.Uniform, .Resettable} - } -} - -@(require_results) -default_random_generator :: proc "contextless" (state: ^Default_Random_State = nil) -> Random_Generator { - return { - procedure = default_random_generator_proc, - data = state, - } -} \ No newline at end of file diff --git a/base/runtime/random_generator_chacha8.odin b/base/runtime/random_generator_chacha8.odin new file mode 100644 index 00000000000..c9724981178 --- /dev/null +++ b/base/runtime/random_generator_chacha8.odin @@ -0,0 +1,178 @@ +package runtime + +import "base:intrinsics" + +// This is an implementation of the Chacha8Rand DRBG, as specified +// in https://github.com/C2SP/C2SP/blob/main/chacha8rand.md +// +// There is a tradeoff to be made between state-size and performance, +// in terms of the amount of rng output buffered. +// +// The sensible buffer sizes are: +// - 256-bytes: 128-bit SIMD with 16x vector registers (SSE2) +// - 512-bytes: 128-bit SIMD with 32x vector registers (ARMv8), +// 256-bit SIMD with 16x vector registers (AVX2), +// - 1024-bytes: AVX-512 +// +// Notes: +// - Smaller than 256-bytes is possible but would require redundant +// calls to the ChaCha8 function, which is prohibitively expensive. +// - Larger than 1024-bytes is possible but pointless as the construct +// is defined around 992-bytes of RNG output and 32-bytes of input +// per iteration. +// +// This implementation opts for a 1024-byte buffer for simplicity, +// under the rationale that modern extremely memory constrained targets +// provide suitable functionality in hardware, and the language makes +// supporting the various SIMD flavors easy. + +@(private = "file") +RNG_SEED_SIZE :: 32 +@(private) +RNG_OUTPUT_PER_ITER :: 1024 - RNG_SEED_SIZE + +@(private) +CHACHA_SIGMA_0: u32 : 0x61707865 +@(private) +CHACHA_SIGMA_1: u32 : 0x3320646e +@(private) +CHACHA_SIGMA_2: u32 : 0x79622d32 +@(private) +CHACHA_SIGMA_3: u32 : 0x6b206574 +@(private) +CHACHA_ROUNDS :: 8 + +Default_Random_State :: struct { + _buf: [1024]byte, + _off: int, + _seeded: bool, +} + +@(require_results) +default_random_generator :: proc "contextless" (state: ^Default_Random_State = nil) -> Random_Generator { + return { + procedure = default_random_generator_proc, + data = state, + } +} + +default_random_generator_proc :: proc(data: rawptr, mode: Random_Generator_Mode, p: []byte) { + @(thread_local) + state: Default_Random_State + + r: ^Default_Random_State = &state + if data != nil { + r = cast(^Default_Random_State)data + } + next_seed := r._buf[RNG_OUTPUT_PER_ITER:] + + switch mode { + case .Read: + if !r._seeded { // Unlikely. + // TODO: USE SECURE ENTROPY + weak_seed := u64(intrinsics.read_cycle_counter()) + tmp: [32]byte + intrinsics.unaligned_store((^u64)(&tmp[0]), weak_seed) + + copy_slice(next_seed, tmp[:]) + + r._off = RNG_OUTPUT_PER_ITER // Force refill. + r._seeded = true + /* + if crypto.HAS_RAND_BYTES { + crypto.rand_bytes(next_seed) + r._off = RNG_OUTPUT_PER_ITER // Force refill. + } else { + panic("chacha8rand: no system entropy source") + } + */ + } + + assert(r._off <= RNG_OUTPUT_PER_ITER, "chacha8rand/BUG: outputed key material") + if r._off >= RNG_OUTPUT_PER_ITER { // Unlikely. + chacha8rand_refill(r) + } + + // We are guaranteed to have at least some RNG output buffered. + // + // As an invariant each read will consume a multiple of 8-bytes + // of output at a time. + assert(r._off <= RNG_OUTPUT_PER_ITER - 8, "chacha8rand/BUG: less than 8-bytes of output available") + assert(r._off % 8 == 0, "chacha8rand/BUG: buffered output is not a multiple of 8-bytes") + + p_len := len(p) + if p_len == size_of(u64) { + #no_bounds_check { + // Fast path for a 64-bit destination. + src := (^u64)(raw_data(r._buf[r._off:])) + intrinsics.unaligned_store((^u64)(raw_data(p)), src^) + src^ = 0 // Erasure (backtrack resistance) + r._off += 8 + } + return + } + + p_ := p + for remaining := p_len; remaining > 0; { + sz := min(remaining, RNG_OUTPUT_PER_ITER - r._off) + #no_bounds_check { + copy(p_[:sz], r._buf[r._off:]) + p_ = p_[sz:] + remaining -= sz + } + rounded_sz := ((sz + 7) / 8) * 8 + new_off := r._off + rounded_sz + #no_bounds_check if new_off < RNG_OUTPUT_PER_ITER { + // Erasure (backtrack resistance) + intrinsics.mem_zero(raw_data(r._buf[r._off:]), rounded_sz) + r._off = new_off + } else { + // Can omit erasure since we are overwriting the entire + // buffer. + chacha8rand_refill(r) + } + } + + case .Reset: + // If no seed is passed, the next call to .Read will attempt to + // reseed from the system entropy source. + if len(p) == 0 { + r._seeded = false + return + } + + // The cryptographic security of the output depends entirely + // on the quality of the entropy in the seed, we will allow + // re-seeding (as it makes testing easier), but callers that + // decide to provide arbitrary seeds are on their own as far + // as ensuring high-quality entropy. + intrinsics.mem_zero(raw_data(next_seed), RNG_SEED_SIZE) + copy(next_seed, p) + r._seeded = true + r._off = RNG_OUTPUT_PER_ITER // Force a refill. + + case .Query_Info: + if len(p) != size_of(Random_Generator_Query_Info) { + return + } + info := (^Random_Generator_Query_Info)(raw_data(p)) + info^ += {.Uniform, .Cryptographic, .Resettable} + } +} + +@(private = "file") +chacha8rand_refill :: proc(r: ^Default_Random_State) { + assert(r._seeded == true, "chacha8rand/BUG: unseeded refill") + + // i386 has insufficient vector registers to use the + // accelerated path at the moment. + when ODIN_ARCH == .amd64 && intrinsics.has_target_feature("avx2") { + chacha8rand_refill_simd256(r) + } else when HAS_HARDWARE_SIMD && ODIN_ARCH != .i386 { + chacha8rand_refill_simd128(r) + } else { + chacha8rand_refill_ref(r) + } + + r._off = 0 +} diff --git a/base/runtime/random_generator_chacha8_ref.odin b/base/runtime/random_generator_chacha8_ref.odin new file mode 100644 index 00000000000..b1e812c3f04 --- /dev/null +++ b/base/runtime/random_generator_chacha8_ref.odin @@ -0,0 +1,145 @@ +package runtime + +import "base:intrinsics" + +@(private) +chacha8rand_refill_ref :: proc(r: ^Default_Random_State) { + // Initialize the base state. + k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:])) + when ODIN_ENDIAN == .Little { + s4 := k[0] + s5 := k[1] + s6 := k[2] + s7 := k[3] + s8 := k[4] + s9 := k[5] + s10 := k[6] + s11 := k[7] + } else { + s4 := intrinsics.byte_swap(k[0]) + s5 := intrinsics.byte_swap(k[1]) + s6 := intrinsics.byte_swap(k[2]) + s7 := intrinsics.byte_swap(k[3]) + s8 := intrinsics.byte_swap(k[4]) + s9 := intrinsics.byte_swap(k[5]) + s10 := intrinsics.byte_swap(k[6]) + s11 := intrinicss.byte_swap(k[7]) + } + s12: u32 // Counter starts at 0. + s13, s14, s15: u32 // IV of all 0s. + + dst: [^]u32 = (^u32)(raw_data(r._buf[:])) + + // At least with LLVM21 force_inline produces identical perf to + // manual inlining, yay. + quarter_round := #force_inline proc "contextless" (a, b, c, d: u32) -> (u32, u32, u32, u32) { + a, b, c, d := a, b, c, d + + a += b + d ~= a + d = rotl(d, 16) + + c += d + b ~= c + b = rotl(b, 12) + + a += b + d ~= a + d = rotl(d, 8) + + c += d + b ~= c + b = rotl(b, 7) + + return a, b, c, d + } + + // Filippo Valsorda made an observation that only one of the column + // round depends on the counter (s12), so it is worth precomputing + // and reusing across multiple blocks. As far as I know, only Go's + // chacha implementation does this. + + p1, p5, p9, p13 := quarter_round(CHACHA_SIGMA_1, s5, s9, s13) + p2, p6, p10, p14 := quarter_round(CHACHA_SIGMA_2, s6, s10, s14) + p3, p7, p11, p15 := quarter_round(CHACHA_SIGMA_3, s7, s11, s15) + + // 4 groups + for g := 0; g < 4; g = g + 1 { + // 4 blocks per group + for n := 0; n < 4; n = n + 1 { + // First column round that depends on the counter + p0, p4, p8, p12 := quarter_round(CHACHA_SIGMA_0, s4, s8, s12) + + // First diagonal round + x0, x5, x10, x15 := quarter_round(p0, p5, p10, p15) + x1, x6, x11, x12 := quarter_round(p1, p6, p11, p12) + x2, x7, x8, x13 := quarter_round(p2, p7, p8, p13) + x3, x4, x9, x14 := quarter_round(p3, p4, p9, p14) + + for i := CHACHA_ROUNDS - 2; i > 0; i = i - 2 { + x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12) + x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13) + x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14) + x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15) + + x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15) + x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12) + x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13) + x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14) + } + + // Interleave 4 blocks + // NB: The additions of sigma and the counter are omitted + STRIDE :: 4 + d_ := dst[n:] + when ODIN_ENDIAN == .Little { + d_[STRIDE*0] = x0 + d_[STRIDE*1] = x1 + d_[STRIDE*2] = x2 + d_[STRIDE*3] = x3 + d_[STRIDE*4] = x4 + s4 + d_[STRIDE*5] = x5 + s5 + d_[STRIDE*6] = x6 + s6 + d_[STRIDE*7] = x7 + s7 + d_[STRIDE*8] = x8 + s8 + d_[STRIDE*9] = x9 + s9 + d_[STRIDE*10] = x10 + s10 + d_[STRIDE*11] = x11 + s11 + d_[STRIDE*12] = x12 + d_[STRIDE*13] = x13 + s13 + d_[STRIDE*14] = x14 + s14 + d_[STRIDE*15] = x15 + s15 + } else { + d_[STRIDE*0] = intrinsics.byte_swap(x0) + d_[STRIDE*1] = intrinsics.byte_swap(x1) + d_[STRIDE*2] = intrinsics.byte_swap(x2) + d_[STRIDE*3] = intrinsics.byte_swap(x3) + d_[STRIDE*4] = intrinsics.byte_swap(x4 + s4) + d_[STRIDE*5] = intrinsics.byte_swap(x5 + s5) + d_[STRIDE*6] = intrinsics.byte_swap(x6 + s6) + d_[STRIDE*7] = intrinsics.byte_swap(x7 + s7) + d_[STRIDE*8] = intrinsics.byte_swap(x8 + s8) + d_[STRIDE*9] = intrinsics.byte_swap(x9 + s9) + d_[STRIDE*10] = intrinsics.byte_swap(x10 + s10) + d_[STRIDE*11] = intrinsics.byte_swap(x11 + s11) + d_[STRIDE*12] = intrinsics.byte_swap(x12) + d_[STRIDE*13] = intrinsics.byte_swap(x13 + s13) + d_[STRIDE*14] = intrinsics.byte_swap(x14 + s14) + d_[STRIDE*15] = intrinsics.byte_swap(x15 + s15) + } + + s12 = s12 + 1 // Increment the counter + } + + dst = dst[16*4:] + } +} + +// This replicates `rotate_left32` from `core:math/bits`, under the +// assumption that this will live in `base:runtime`. +@(require_results, private = "file") +rotl :: #force_inline proc "contextless" (x: u32, k: int) -> u32 { + n :: 32 + s := uint(k) & (n-1) + return x << s | x >> (n-s) +} diff --git a/base/runtime/random_generator_chacha8_simd128.odin b/base/runtime/random_generator_chacha8_simd128.odin new file mode 100644 index 00000000000..d63d9262057 --- /dev/null +++ b/base/runtime/random_generator_chacha8_simd128.odin @@ -0,0 +1,290 @@ +#+build !i386 +package runtime + +import "base:intrinsics" + +@(private = "file") +u32x4 :: #simd[4]u32 + +@(private = "file") +S0: u32x4 : {CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0} +@(private = "file") +S1: u32x4 : {CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1} +@(private = "file") +S2: u32x4 : {CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2} +@(private = "file") +S3: u32x4 : {CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3} + +@(private = "file") +_ROT_7L: u32x4 : {7, 7, 7, 7} +@(private = "file") +_ROT_7R: u32x4 : {25, 25, 25, 25} +@(private = "file") +_ROT_12L: u32x4 : {12, 12, 12, 12} +@(private = "file") +_ROT_12R: u32x4 : {20, 20, 20, 20} +@(private = "file") +_ROT_8L: u32x4 : {8, 8, 8, 8} +@(private = "file") +_ROT_8R: u32x4 : {24, 24, 24, 24} +@(private = "file") +_ROT_16: u32x4 : {16, 16, 16, 16} +@(private = "file") +_CTR_INC_4: u32x4 : {4, 4, 4, 4} +@(private = "file") +_CTR_INC_8: u32x4 : {8, 8, 8, 8} + +when ODIN_ENDIAN == .Big { + @(private = "file") + _byteswap_u32x4 :: #force_inline proc "contextless" (v: u32x4) -> u32x4 { + u8x16 :: #simd[16]u8 + return( + transmute(u32x4)simd.shuffle( + transmute(u8x16)v, + transmute(u8x16)v, + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, + ) + ) + } +} + +@(private) +chacha8rand_refill_simd128 :: proc(r: ^Default_Random_State) { + // Initialize the base state. + k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:])) + when ODIN_ENDIAN == .Little { + s4_ := k[0] + s5_ := k[1] + s6_ := k[2] + s7_ := k[3] + s8_ := k[4] + s9_ := k[5] + s10_ := k[6] + s11_ := k[7] + } else { + s4_ := intrinsics.byte_swap(k[0]) + s5_ := intrinsics.byte_swap(k[1]) + s6_ := intrinsics.byte_swap(k[2]) + s7_ := intrinsics.byte_swap(k[3]) + s8_ := intrinsics.byte_swap(k[4]) + s9_ := intrinsics.byte_swap(k[5]) + s10_ := intrinsics.byte_swap(k[6]) + s11_ := intrinicss.byte_swap(k[7]) + } + + // 4-lane ChaCha8. + s4 := u32x4{s4_, s4_, s4_, s4_} + s5 := u32x4{s5_, s5_, s5_, s5_} + s6 := u32x4{s6_, s6_, s6_, s6_} + s7 := u32x4{s7_, s7_, s7_, s7_} + s8 := u32x4{s8_, s8_, s8_, s8_} + s9 := u32x4{s9_, s9_, s9_, s9_} + s10 := u32x4{s10_, s10_, s10_, s10_} + s11 := u32x4{s11_, s11_, s11_, s11_} + s12 := u32x4{0, 1, 2, 3} + s13, s14, s15: u32x4 + + dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:])) + + quarter_round := #force_inline proc "contextless" (a, b, c, d: u32x4) -> (u32x4, u32x4, u32x4, u32x4) { + a, b, c, d := a, b, c, d + + a = intrinsics.simd_add(a, b) + d = intrinsics.simd_bit_xor(d, a) + d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16)) + + c = intrinsics.simd_add(c, d) + b = intrinsics.simd_bit_xor(b, c) + b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R)) + + a = intrinsics.simd_add(a, b) + d = intrinsics.simd_bit_xor(d, a) + d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R)) + + c = intrinsics.simd_add(c, d) + b = intrinsics.simd_bit_xor(b, c) + b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R)) + + return a, b, c, d + } + + // 8 blocks at a time. + // + // Note: + // This uses a ton of registers so it is only worth it on targets + // that have something like 32 128-bit registers. This is currently + // all ARMv8 targets, and RISC-V Zvl128b (`V` application profile) + // targets. + // + // While our current definition of `.arm32` is 32-bit ARMv8, this + // may change in the future (ARMv7 is still relevant), and things + // like Cortex-A8/A9 does "pretend" 128-bit SIMD 64-bits at a time + // thus needs bemchmarking. + when ODIN_ARCH == .arm64 || ODIN_ARCH == .riscv64 { + for _ in 0..<2 { + x0_0, x1_0, x2_0, x3_0 := S0, S1, S2, S3 + x4_0, x5_0, x6_0, x7_0 := s4, s5, s6, s7 + x8_0, x9_0, x10_0, x11_0 := s8, s9, s10, s11 + x12_0, x13_0, x14_0, x15_0 := s12, s13, s14, s15 + + x0_1, x1_1, x2_1, x3_1 := S0, S1, S2, S3 + x4_1, x5_1, x6_1, x7_1 := s4, s5, s6, s7 + x8_1, x9_1, x10_1, x11_1 := s8, s9, s10, s11 + x12_1 := intrinsics.simd_add(s12, _CTR_INC_4) + x13_1, x14_1, x15_1 := s13, s14, s15 + + for i := CHACHA_ROUNDS; i > 0; i = i - 2 { + x0_0, x4_0, x8_0, x12_0 = quarter_round(x0_0, x4_0, x8_0, x12_0) + x0_1, x4_1, x8_1, x12_1 = quarter_round(x0_1, x4_1, x8_1, x12_1) + x1_0, x5_0, x9_0, x13_0 = quarter_round(x1_0, x5_0, x9_0, x13_0) + x1_1, x5_1, x9_1, x13_1 = quarter_round(x1_1, x5_1, x9_1, x13_1) + x2_0, x6_0, x10_0, x14_0 = quarter_round(x2_0, x6_0, x10_0, x14_0) + x2_1, x6_1, x10_1, x14_1 = quarter_round(x2_1, x6_1, x10_1, x14_1) + x3_0, x7_0, x11_0, x15_0 = quarter_round(x3_0, x7_0, x11_0, x15_0) + x3_1, x7_1, x11_1, x15_1 = quarter_round(x3_1, x7_1, x11_1, x15_1) + + x0_0, x5_0, x10_0, x15_0 = quarter_round(x0_0, x5_0, x10_0, x15_0) + x0_1, x5_1, x10_1, x15_1 = quarter_round(x0_1, x5_1, x10_1, x15_1) + x1_0, x6_0, x11_0, x12_0 = quarter_round(x1_0, x6_0, x11_0, x12_0) + x1_1, x6_1, x11_1, x12_1 = quarter_round(x1_1, x6_1, x11_1, x12_1) + x2_0, x7_0, x8_0, x13_0 = quarter_round(x2_0, x7_0, x8_0, x13_0) + x2_1, x7_1, x8_1, x13_1 = quarter_round(x2_1, x7_1, x8_1, x13_1) + x3_0, x4_0, x9_0, x14_0 = quarter_round(x3_0, x4_0, x9_0, x14_0) + x3_1, x4_1, x9_1, x14_1 = quarter_round(x3_1, x4_1, x9_1, x14_1) + } + + when ODIN_ENDIAN == .Little { + intrinsics.unaligned_store((^u32x4)(dst[0:]), x0_0) + intrinsics.unaligned_store((^u32x4)(dst[1:]), x1_0) + intrinsics.unaligned_store((^u32x4)(dst[2:]), x2_0) + intrinsics.unaligned_store((^u32x4)(dst[3:]), x3_0) + intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4_0, s4)) + intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5_0, s5)) + intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6_0, s6)) + intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7_0, s7)) + intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8_0, s8)) + intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9_0, s9)) + intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10_0, s10)) + intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11_0, s11)) + intrinsics.unaligned_store((^u32x4)(dst[12:]), x12_0) + intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13_0, s13)) + intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14_0, s14)) + intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15_0, s15)) + + intrinsics.unaligned_store((^u32x4)(dst[16:]), x0_1) + intrinsics.unaligned_store((^u32x4)(dst[17:]), x1_1) + intrinsics.unaligned_store((^u32x4)(dst[18:]), x2_1) + intrinsics.unaligned_store((^u32x4)(dst[19:]), x3_1) + intrinsics.unaligned_store((^u32x4)(dst[20:]), intrinsics.simd_add(x4_1, s4)) + intrinsics.unaligned_store((^u32x4)(dst[21:]), intrinsics.simd_add(x5_1, s5)) + intrinsics.unaligned_store((^u32x4)(dst[22:]), intrinsics.simd_add(x6_1, s6)) + intrinsics.unaligned_store((^u32x4)(dst[23:]), intrinsics.simd_add(x7_1, s7)) + intrinsics.unaligned_store((^u32x4)(dst[24:]), intrinsics.simd_add(x8_1, s8)) + intrinsics.unaligned_store((^u32x4)(dst[25:]), intrinsics.simd_add(x9_1, s9)) + intrinsics.unaligned_store((^u32x4)(dst[26:]), intrinsics.simd_add(x10_1, s10)) + intrinsics.unaligned_store((^u32x4)(dst[27:]), intrinsics.simd_add(x11_1, s11)) + intrinsics.unaligned_store((^u32x4)(dst[28:]), x12_1) + intrinsics.unaligned_store((^u32x4)(dst[29:]), intrinsics.simd_add(x13_1, s13)) + intrinsics.unaligned_store((^u32x4)(dst[30:]), intrinsics.simd_add(x14_1, s14)) + intrinsics.unaligned_store((^u32x4)(dst[31:]), intrinsics.simd_add(x15_1, s15)) + } else { + intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0_0)) + intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1_0)) + intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2_0)) + intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3_0)) + intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4_0, s4))) + intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5_0, s5))) + intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6_0, s6))) + intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7_0, s7))) + intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8_0, s8))) + intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9_0, s9))) + intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10_0, s10))) + intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11_0, s11))) + intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12_0)) + intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13_0, s13))) + intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14_0, s14))) + intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15_0, s15))) + + intrinsics.unaligned_store((^u32x4)(dst[16:]), _byteswap_u32x4(x0_1)) + intrinsics.unaligned_store((^u32x4)(dst[17:]), _byteswap_u32x4(x1_1)) + intrinsics.unaligned_store((^u32x4)(dst[18:]), _byteswap_u32x4(x2_1)) + intrinsics.unaligned_store((^u32x4)(dst[19:]), _byteswap_u32x4(x3_1)) + intrinsics.unaligned_store((^u32x4)(dst[20:]), _byteswap_u32x4(intrinsics.simd_add(x4_1, s4))) + intrinsics.unaligned_store((^u32x4)(dst[21:]), _byteswap_u32x4(intrinsics.simd_add(x5_1, s5))) + intrinsics.unaligned_store((^u32x4)(dst[22:]), _byteswap_u32x4(intrinsics.simd_add(x6_1, s6))) + intrinsics.unaligned_store((^u32x4)(dst[23:]), _byteswap_u32x4(intrinsics.simd_add(x7_1, s7))) + intrinsics.unaligned_store((^u32x4)(dst[24:]), _byteswap_u32x4(intrinsics.simd_add(x8_1, s8))) + intrinsics.unaligned_store((^u32x4)(dst[25:]), _byteswap_u32x4(intrinsics.simd_add(x9_1, s9))) + intrinsics.unaligned_store((^u32x4)(dst[26:]), _byteswap_u32x4(intrinsics.simd_add(x10_1, s10))) + intrinsics.unaligned_store((^u32x4)(dst[27:]), _byteswap_u32x4(intrinsics.simd_add(x11_1, s11))) + intrinsics.unaligned_store((^u32x4)(dst[28:]), _byteswap_u32x4(x12_1)) + intrinsics.unaligned_store((^u32x4)(dst[29:]), _byteswap_u32x4(intrinsics.simd_add(x13_1, s13))) + intrinsics.unaligned_store((^u32x4)(dst[30:]), _byteswap_u32x4(intrinsics.simd_add(x14_1, s14))) + intrinsics.unaligned_store((^u32x4)(dst[31:]), _byteswap_u32x4(intrinsics.simd_add(x15_1, s15))) + } + + s12 = intrinsics.simd_add(s12, _CTR_INC_8) + + dst = dst[32:] + } + } else { + for _ in 0..<4 { + x0, x1, x2, x3 := S0, S1, S2, S3 + x4, x5, x6, x7 := s4, s5, s6, s7 + x8, x9, x10, x11 := s8, s9, s10, s11 + x12, x13, x14, x15 := s12, s13, s14, s15 + + for i := CHACHA_ROUNDS; i > 0; i = i - 2 { + x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12) + x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13) + x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14) + x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15) + + x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15) + x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12) + x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13) + x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14) + } + + when ODIN_ENDIAN == .Little { + intrinsics.unaligned_store((^u32x4)(dst[0:]), x0) + intrinsics.unaligned_store((^u32x4)(dst[1:]), x1) + intrinsics.unaligned_store((^u32x4)(dst[2:]), x2) + intrinsics.unaligned_store((^u32x4)(dst[3:]), x3) + intrinsics.unaligned_store((^u32x4)(dst[4:]), intrinsics.simd_add(x4, s4)) + intrinsics.unaligned_store((^u32x4)(dst[5:]), intrinsics.simd_add(x5, s5)) + intrinsics.unaligned_store((^u32x4)(dst[6:]), intrinsics.simd_add(x6, s6)) + intrinsics.unaligned_store((^u32x4)(dst[7:]), intrinsics.simd_add(x7, s7)) + intrinsics.unaligned_store((^u32x4)(dst[8:]), intrinsics.simd_add(x8, s8)) + intrinsics.unaligned_store((^u32x4)(dst[9:]), intrinsics.simd_add(x9, s9)) + intrinsics.unaligned_store((^u32x4)(dst[10:]), intrinsics.simd_add(x10, s10)) + intrinsics.unaligned_store((^u32x4)(dst[11:]), intrinsics.simd_add(x11, s11)) + intrinsics.unaligned_store((^u32x4)(dst[12:]), x12) + intrinsics.unaligned_store((^u32x4)(dst[13:]), intrinsics.simd_add(x13, s13)) + intrinsics.unaligned_store((^u32x4)(dst[14:]), intrinsics.simd_add(x14, s14)) + intrinsics.unaligned_store((^u32x4)(dst[15:]), intrinsics.simd_add(x15, s15)) + } else { + intrinsics.unaligned_store((^u32x4)(dst[0:]), _byteswap_u32x4(x0)) + intrinsics.unaligned_store((^u32x4)(dst[1:]), _byteswap_u32x4(x1)) + intrinsics.unaligned_store((^u32x4)(dst[2:]), _byteswap_u32x4(x2)) + intrinsics.unaligned_store((^u32x4)(dst[3:]), _byteswap_u32x4(x3)) + intrinsics.unaligned_store((^u32x4)(dst[4:]), _byteswap_u32x4(intrinsics.simd_add(x4, s4))) + intrinsics.unaligned_store((^u32x4)(dst[5:]), _byteswap_u32x4(intrinsics.simd_add(x5, s5))) + intrinsics.unaligned_store((^u32x4)(dst[6:]), _byteswap_u32x4(intrinsics.simd_add(x6, s6))) + intrinsics.unaligned_store((^u32x4)(dst[7:]), _byteswap_u32x4(intrinsics.simd_add(x7, s7))) + intrinsics.unaligned_store((^u32x4)(dst[8:]), _byteswap_u32x4(intrinsics.simd_add(x8, s8))) + intrinsics.unaligned_store((^u32x4)(dst[9:]), _byteswap_u32x4(intrinsics.simd_add(x9, s9))) + intrinsics.unaligned_store((^u32x4)(dst[10:]), _byteswap_u32x4(intrinsics.simd_add(x10, s10))) + intrinsics.unaligned_store((^u32x4)(dst[11:]), _byteswap_u32x4(intrinsics.simd_add(x11, s11))) + intrinsics.unaligned_store((^u32x4)(dst[12:]), _byteswap_u32x4(x12)) + intrinsics.unaligned_store((^u32x4)(dst[13:]), _byteswap_u32x4(intrinsics.simd_add(x13, s13))) + intrinsics.unaligned_store((^u32x4)(dst[14:]), _byteswap_u32x4(intrinsics.simd_add(x14, s14))) + intrinsics.unaligned_store((^u32x4)(dst[15:]), _byteswap_u32x4(intrinsics.simd_add(x15, s15))) + } + + s12 = intrinsics.simd_add(s12, _CTR_INC_4) + + dst = dst[16:] + } + } +} diff --git a/base/runtime/random_generator_chacha8_simd256.odin b/base/runtime/random_generator_chacha8_simd256.odin new file mode 100644 index 00000000000..c0985f456af --- /dev/null +++ b/base/runtime/random_generator_chacha8_simd256.odin @@ -0,0 +1,197 @@ +#+build amd64 +package runtime + +import "base:intrinsics" + +#assert(ODIN_ENDIAN == .Little) + +@(private = "file") +u32x8 :: #simd[8]u32 +@(private = "file") +u32x4 :: #simd[4]u32 + +@(private = "file") +S0: u32x8 : { + CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, + CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, CHACHA_SIGMA_0, +} +@(private = "file") +S1: u32x8 : { + CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, + CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, CHACHA_SIGMA_1, +} +@(private = "file") +S2: u32x8 : { + CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, + CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, CHACHA_SIGMA_2, +} +@(private = "file") +S3: u32x8 : { + CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, + CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, CHACHA_SIGMA_3, +} + +@(private = "file") +_ROT_7L: u32x8 : {7, 7, 7, 7, 7, 7, 7, 7} +@(private = "file") +_ROT_7R: u32x8 : {25, 25, 25, 25, 25, 25, 25, 25} +@(private = "file") +_ROT_12L: u32x8 : {12, 12, 12, 12, 12, 12, 12, 12} +@(private = "file") +_ROT_12R: u32x8 : {20, 20, 20, 20, 20, 20, 20, 20} +@(private = "file") +_ROT_8L: u32x8 : {8, 8, 8, 8, 8, 8, 8, 8} +@(private = "file") +_ROT_8R: u32x8 : {24, 24, 24, 24, 24, 24, 24, 24} +@(private = "file") +_ROT_16: u32x8 : {16, 16, 16, 16, 16, 16, 16, 16} +@(private = "file") +_CTR_INC_8: u32x8 : {8, 8, 8, 8, 8, 8, 8, 8} + +// To the best of my knowledge this is only really useful on +// modern x86-64 as most ARM silicon is missing support for SVE2. + +@(private, enable_target_feature = "avx,avx2") +chacha8rand_refill_simd256 :: proc(r: ^Default_Random_State) { + // Initialize the base state. + k: [^]u32 = (^u32)(raw_data(r._buf[RNG_OUTPUT_PER_ITER:])) + s4_ := k[0] + s5_ := k[1] + s6_ := k[2] + s7_ := k[3] + s8_ := k[4] + s9_ := k[5] + s10_ := k[6] + s11_ := k[7] + + // 8-lane ChaCha8. + s4 := u32x8{s4_, s4_, s4_, s4_, s4_, s4_, s4_, s4_} + s5 := u32x8{s5_, s5_, s5_, s5_, s5_, s5_, s5_, s5_} + s6 := u32x8{s6_, s6_, s6_, s6_, s6_, s6_, s6_, s6_} + s7 := u32x8{s7_, s7_, s7_, s7_, s7_, s7_, s7_, s7_} + s8 := u32x8{s8_, s8_, s8_, s8_, s8_, s8_, s8_, s8_} + s9 := u32x8{s9_, s9_, s9_, s9_, s9_, s9_, s9_, s9_} + s10 := u32x8{s10_, s10_, s10_, s10_, s10_, s10_, s10_, s10_} + s11 := u32x8{s11_, s11_, s11_, s11_, s11_, s11_, s11_, s11_} + s12 := u32x8{0, 1, 2, 3, 4, 5, 6, 7} + s13, s14, s15: u32x8 + + u32x4 :: #simd[4]u32 + dst: [^]u32x4 = (^u32x4)(raw_data(r._buf[:])) + + quarter_round := #force_inline proc "contextless" (a, b, c, d: u32x8) -> (u32x8, u32x8, u32x8, u32x8) { + a, b, c, d := a, b, c, d + + a = intrinsics.simd_add(a, b) + d = intrinsics.simd_bit_xor(d, a) + d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_16), intrinsics.simd_shr(d, _ROT_16)) + + c = intrinsics.simd_add(c, d) + b = intrinsics.simd_bit_xor(b, c) + b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_12L), intrinsics.simd_shr(b, _ROT_12R)) + + a = intrinsics.simd_add(a, b) + d = intrinsics.simd_bit_xor(d, a) + d = intrinsics.simd_bit_xor(intrinsics.simd_shl(d, _ROT_8L), intrinsics.simd_shr(d, _ROT_8R)) + + c = intrinsics.simd_add(c, d) + b = intrinsics.simd_bit_xor(b, c) + b = intrinsics.simd_bit_xor(intrinsics.simd_shl(b, _ROT_7L), intrinsics.simd_shr(b, _ROT_7R)) + + return a, b, c, d + } + + for _ in 0..<2 { + x0, x1, x2, x3 := S0, S1, S2, S3 + x4, x5, x6, x7 := s4, s5, s6, s7 + x8, x9, x10, x11 := s8, s9, s10, s11 + x12, x13, x14, x15 := s12, s13, s14, s15 + + for i := CHACHA_ROUNDS; i > 0; i = i - 2 { + x0, x4, x8, x12 = quarter_round(x0, x4, x8, x12) + x1, x5, x9, x13 = quarter_round(x1, x5, x9, x13) + x2, x6, x10, x14 = quarter_round(x2, x6, x10, x14) + x3, x7, x11, x15 = quarter_round(x3, x7, x11, x15) + + x0, x5, x10, x15 = quarter_round(x0, x5, x10, x15) + x1, x6, x11, x12 = quarter_round(x1, x6, x11, x12) + x2, x7, x8, x13 = quarter_round(x2, x7, x8, x13) + x3, x4, x9, x14 = quarter_round(x3, x4, x9, x14) + } + + x4 = intrinsics.simd_add(x4, s4) + x5 = intrinsics.simd_add(x5, s5) + x6 = intrinsics.simd_add(x6, s6) + x7 = intrinsics.simd_add(x7, s7) + x8 = intrinsics.simd_add(x8, s8) + x9 = intrinsics.simd_add(x9, s9) + x10 = intrinsics.simd_add(x10, s10) + x11 = intrinsics.simd_add(x11, s11) + x13 = intrinsics.simd_add(x13, s13) + x14 = intrinsics.simd_add(x14, s14) + x15 = intrinsics.simd_add(x15, s15) + + // Ok, now we have x0->x15 with 8 lanes, but we need to + // output the first 4 blocks, then the second 4 blocks. + // + // LLVM appears not to consider "this instruction is totally + // awful on the given microarchitcture", which leads to + // `VPCOMPRESSED` being generated iff AVX512 support is + // enabled for `intrinsics.simd_masked_compress_store`. + // On Zen 4, this leads to a 50% performance regression vs + // the 128-bit SIMD code. + // + // The fake intrinsic (because LLVM doesn't appear to have + // an amd64 specific one), doesn't generate `VEXTRACTI128`, + // but instead does cleverness without horrible regressions. + + intrinsics.unaligned_store((^u32x4)(dst[0:]), _mm_mm256_extracti128_si256(x0, 0)) + intrinsics.unaligned_store((^u32x4)(dst[1:]), _mm_mm256_extracti128_si256(x1, 0)) + intrinsics.unaligned_store((^u32x4)(dst[2:]), _mm_mm256_extracti128_si256(x2, 0)) + intrinsics.unaligned_store((^u32x4)(dst[3:]), _mm_mm256_extracti128_si256(x3, 0)) + intrinsics.unaligned_store((^u32x4)(dst[4:]), _mm_mm256_extracti128_si256(x4, 0)) + intrinsics.unaligned_store((^u32x4)(dst[5:]), _mm_mm256_extracti128_si256(x5, 0)) + intrinsics.unaligned_store((^u32x4)(dst[6:]), _mm_mm256_extracti128_si256(x6, 0)) + intrinsics.unaligned_store((^u32x4)(dst[7:]), _mm_mm256_extracti128_si256(x7, 0)) + intrinsics.unaligned_store((^u32x4)(dst[8:]), _mm_mm256_extracti128_si256(x8, 0)) + intrinsics.unaligned_store((^u32x4)(dst[9:]), _mm_mm256_extracti128_si256(x9, 0)) + intrinsics.unaligned_store((^u32x4)(dst[10:]), _mm_mm256_extracti128_si256(x10, 0)) + intrinsics.unaligned_store((^u32x4)(dst[11:]), _mm_mm256_extracti128_si256(x11, 0)) + intrinsics.unaligned_store((^u32x4)(dst[12:]), _mm_mm256_extracti128_si256(x12, 0)) + intrinsics.unaligned_store((^u32x4)(dst[13:]), _mm_mm256_extracti128_si256(x13, 0)) + intrinsics.unaligned_store((^u32x4)(dst[14:]), _mm_mm256_extracti128_si256(x14, 0)) + intrinsics.unaligned_store((^u32x4)(dst[15:]), _mm_mm256_extracti128_si256(x15, 0)) + + intrinsics.unaligned_store((^u32x4)(dst[16:]), _mm_mm256_extracti128_si256(x0, 1)) + intrinsics.unaligned_store((^u32x4)(dst[17:]), _mm_mm256_extracti128_si256(x1, 1)) + intrinsics.unaligned_store((^u32x4)(dst[18:]), _mm_mm256_extracti128_si256(x2, 1)) + intrinsics.unaligned_store((^u32x4)(dst[19:]), _mm_mm256_extracti128_si256(x3, 1)) + intrinsics.unaligned_store((^u32x4)(dst[20:]), _mm_mm256_extracti128_si256(x4, 1)) + intrinsics.unaligned_store((^u32x4)(dst[21:]), _mm_mm256_extracti128_si256(x5, 1)) + intrinsics.unaligned_store((^u32x4)(dst[22:]), _mm_mm256_extracti128_si256(x6, 1)) + intrinsics.unaligned_store((^u32x4)(dst[23:]), _mm_mm256_extracti128_si256(x7, 1)) + intrinsics.unaligned_store((^u32x4)(dst[24:]), _mm_mm256_extracti128_si256(x8, 1)) + intrinsics.unaligned_store((^u32x4)(dst[25:]), _mm_mm256_extracti128_si256(x9, 1)) + intrinsics.unaligned_store((^u32x4)(dst[26:]), _mm_mm256_extracti128_si256(x10, 1)) + intrinsics.unaligned_store((^u32x4)(dst[27:]), _mm_mm256_extracti128_si256(x11, 1)) + intrinsics.unaligned_store((^u32x4)(dst[28:]), _mm_mm256_extracti128_si256(x12, 1)) + intrinsics.unaligned_store((^u32x4)(dst[29:]), _mm_mm256_extracti128_si256(x13, 1)) + intrinsics.unaligned_store((^u32x4)(dst[30:]), _mm_mm256_extracti128_si256(x14, 1)) + intrinsics.unaligned_store((^u32x4)(dst[31:]), _mm_mm256_extracti128_si256(x15, 1)) + + s12 = intrinsics.simd_add(s12, _CTR_INC_8) + + dst = dst[32:] + } +} + +@(private = "file", require_results, enable_target_feature="avx2") +_mm_mm256_extracti128_si256 :: #force_inline proc "c" (a: u32x8, $OFFSET: int) -> u32x4 { + when OFFSET == 0 { + return intrinsics.simd_shuffle(a, a, 0, 1, 2, 3) + } else when OFFSET == 1 { + return intrinsics.simd_shuffle(a, a, 4, 5, 6, 7) + } else { + #panic("chacha8rand: invalid offset") + } +} diff --git a/core/math/rand/rand.odin b/core/math/rand/rand.odin index e8383ca9e5b..3cb0918e322 100644 --- a/core/math/rand/rand.odin +++ b/core/math/rand/rand.odin @@ -11,15 +11,49 @@ Generator :: runtime.Random_Generator Generator_Query_Info :: runtime.Random_Generator_Query_Info Default_Random_State :: runtime.Default_Random_State + +/* +Returns an instance of the runtime pseudorandom generator. If no +initial state is provided, the PRNG will be lazily initialized with +entropy from the system entropy source on first-use. + +The cryptographic security of the returned random number generator +is directly dependent on the quality of the initialization entropy. +Calling `reset`/`create` SHOULD be done with no seed/state, or +32-bytes of high-quality entropy. + +WARNING: +- The lazy initialization will panic if there is no system entropy + source available. +- While the generator is cryptographically secure, developers SHOULD + prefer `crypto.random_generator()`. + +Inputs: +- state: Optional initial PRNG state. + +Returns: +- A `Generator` instance. +*/ default_random_generator :: runtime.default_random_generator @(require_results) -create :: proc(seed: u64) -> (state: Default_Random_State) { +create_u64 :: proc(seed: u64) -> (state: Default_Random_State) { seed := seed runtime.default_random_generator_proc(&state, .Reset, ([^]byte)(&seed)[:size_of(seed)]) return } +@(require_results) +create_bytes :: proc(seed: []byte) -> (state: Default_Random_State) { + runtime.default_random_generator_proc(&state, .Reset, seed) + return +} + +create :: proc { + create_u64, + create_bytes, +} + /* Reset the seed used by the context.random_generator. @@ -39,10 +73,14 @@ Possible Output: 10 */ -reset :: proc(seed: u64, gen := context.random_generator) { - runtime.random_generator_reset_u64(gen, seed) +reset :: proc { + reset_u64, + reset_bytes, } +reset_u64 :: proc(seed: u64, gen := context.random_generator) { + runtime.random_generator_reset_u64(gen, seed) +} reset_bytes :: proc(bytes: []byte, gen := context.random_generator) { runtime.random_generator_reset_bytes(gen, bytes) diff --git a/core/math/rand/rand_pcg.odin b/core/math/rand/rand_pcg.odin new file mode 100644 index 00000000000..009e139be8c --- /dev/null +++ b/core/math/rand/rand_pcg.odin @@ -0,0 +1,107 @@ +package rand + +import "base:intrinsics" +import "base:runtime" + +/* +The state for a PCG64 RXS-M-XS pseudorandom generator. +*/ +PCG_Random_State :: struct { + state: u64, + inc: u64, +} + +pcg_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) { + @(require_results) + read_u64 :: proc "contextless" (r: ^PCG_Random_State) -> u64 { + old_state := r.state + r.state = old_state * 6364136223846793005 + (r.inc|1) + xor_shifted := (((old_state >> 59) + 5) ~ old_state) * 12605985483714917081 + rot := (old_state >> 59) + return (xor_shifted >> rot) | (xor_shifted << ((-rot) & 63)) + } + + @(thread_local) + global_rand_seed: PCG_Random_State + + init :: proc "contextless" (r: ^PCG_Random_State, seed: u64) { + seed := seed + if seed == 0 { + seed = u64(intrinsics.read_cycle_counter()) + } + r.state = 0 + r.inc = (seed << 1) | 1 + _ = read_u64(r) + r.state += seed + _ = read_u64(r) + } + + r: ^PCG_Random_State = --- + if data == nil { + r = &global_rand_seed + } else { + r = cast(^PCG_Random_State)data + } + + switch mode { + case .Read: + if r.state == 0 && r.inc == 0 { + init(r, 0) + } + + switch len(p) { + case size_of(u64): + // Fast path for a 64-bit destination. + intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r)) + case: + // All other cases. + pos := i8(0) + val := u64(0) + for &v in p { + if pos == 0 { + val = read_u64(r) + pos = 8 + } + v = byte(val) + val >>= 8 + pos -= 1 + } + } + + case .Reset: + seed: u64 + runtime.mem_copy_non_overlapping(&seed, raw_data(p), min(size_of(seed), len(p))) + init(r, seed) + + case .Query_Info: + if len(p) != size_of(Generator_Query_Info) { + return + } + info := (^Generator_Query_Info)(raw_data(p)) + info^ += {.Uniform, .Resettable} + } +} + +/* +Returns an instance of the PGC64 RXS-M-XS pseudorandom generator. If no +initial state is provided, the PRNG will be lazily initialized with the +system timestamp counter on first-use. + +WARNING: This random number generator is NOT cryptographically secure, +and is additionally known to be flawed. It is only included for +backward compatibility with historical releases of Odin. +See: https://github.com/odin-lang/Odin/issues/5881 + +Inputs: +- state: Optional initial PRNG state. + +Returns: +- A `Generator` instance. +*/ +@(require_results) +pcg_random_generator :: proc "contextless" (state: ^PCG_Random_State = nil) -> Generator { + return { + procedure = pcg_random_generator_proc, + data = state, + } +} diff --git a/core/math/rand/rand_xoshiro256.odin b/core/math/rand/rand_xoshiro256.odin new file mode 100644 index 00000000000..54dd02130b6 --- /dev/null +++ b/core/math/rand/rand_xoshiro256.odin @@ -0,0 +1,123 @@ +package rand + +import "base:intrinsics" +import "base:runtime" + +import "core:math/bits" + +/* +The state for a xoshiro256** pseudorandom generator. +*/ +Xoshiro256_Random_State :: struct { + s: [4]u64, +} + +xoshiro256_random_generator_proc :: proc(data: rawptr, mode: runtime.Random_Generator_Mode, p: []byte) { + @(require_results) + read_u64 :: proc "contextless" (r: ^Xoshiro256_Random_State) -> u64 { + // xoshiro256** output function and state transition + + result := bits.rotate_left64(r.s[1] * 5, 7) * 9 + t := r.s[1] << 17 + + r.s[2] = r.s[2] ~ r.s[0] + r.s[3] = r.s[3] ~ r.s[1] + r.s[1] = r.s[1] ~ r.s[2] + r.s[0] = r.s[0] ~ r.s[3] + r.s[2] = r.s[2] ~ t + r.s[3] = bits.rotate_left64(r.s[3], 45) + + return result + } + + @(thread_local) + global_rand_seed: Xoshiro256_Random_State + + init :: proc "contextless" (r: ^Xoshiro256_Random_State, seed: u64) { + // splitmix64 to expand a 64-bit seed into 256 bits of state + sm64_next :: proc "contextless" (s: ^u64) -> u64 { + s^ += 0x9E3779B97F4A7C15 + z := s^ + z = (z ~ (z >> 30)) * 0xBF58476D1CE4E5B9 + z = (z ~ (z >> 27)) * 0x94D049BB133111EB + return z ~ (z >> 31) + } + + local_seed := seed + r.s[0] = sm64_next(&local_seed) + r.s[1] = sm64_next(&local_seed) + r.s[2] = sm64_next(&local_seed) + r.s[3] = sm64_next(&local_seed) + // Extremely unlikely all zero; ensure non-zero state + if (r.s[0] | r.s[1] | r.s[2] | r.s[3]) == 0 { + // force a minimal non-zero tweak + r.s[0] = 1 + } + } + + r: ^Xoshiro256_Random_State = --- + if data == nil { + r = &global_rand_seed + } else { + r = cast(^Xoshiro256_Random_State)data + } + + switch mode { + case .Read: + if (r.s[0] | r.s[1] | r.s[2] | r.s[3]) == 0 { + init(r, u64(intrinsics.read_cycle_counter())) + } + + switch len(p) { + case size_of(u64): + // Fast path for a 64-bit destination. + intrinsics.unaligned_store((^u64)(raw_data(p)), read_u64(r)) + case: + // All other cases. + pos := i8(0) + val := u64(0) + for &v in p { + if pos == 0 { + val = read_u64(r) + pos = 8 + } + v = byte(val) + val >>= 8 + pos -= 1 + } + } + + case .Reset: + seed: u64 = 0 + runtime.mem_copy_non_overlapping(&seed, raw_data(p), min(size_of(seed), len(p))) + init(r, seed) + + case .Query_Info: + if len(p) != size_of(Generator_Query_Info) { + return + } + info := (^Generator_Query_Info)(raw_data(p)) + info^ += {.Uniform, .Resettable} + } +} + +/* +Returns an instance of the xoshiro256** pseudorandom generator. If no +initial state is provided, the PRNG will be lazily initialized with the +system timestamp counter on first-use. + +WARNING: This random number generator is NOT cryptographically secure. + +Inputs: +- state: Optional initial PRNG state. + +Returns: +- A `Generator` instance. +*/ +@(require_results) +xoshiro256_random_generator :: proc "contextless" (state: ^Xoshiro256_Random_State = nil) -> Generator { + return { + procedure = xoshiro256_random_generator_proc, + data = state, + } +} diff --git a/core/testing/runner.odin b/core/testing/runner.odin index 9ce4f35cb32..b6691dbf03d 100644 --- a/core/testing/runner.odin +++ b/core/testing/runner.odin @@ -151,9 +151,9 @@ run_test_task :: proc(task: thread.Task) { options = logger_options, } - random_generator_state: runtime.Default_Random_State + random_generator_state: rand.Xoshiro256_Random_State context.random_generator = { - procedure = runtime.default_random_generator_proc, + procedure = rand.xoshiro256_random_generator_proc, data = &random_generator_state, } rand.reset(data.t.seed) diff --git a/examples/demo/demo.odin b/examples/demo/demo.odin index 161d48acba3..1ea06d096e2 100644 --- a/examples/demo/demo.odin +++ b/examples/demo/demo.odin @@ -11,6 +11,7 @@ import "core:reflect" import "base:runtime" import "base:intrinsics" import "core:math/big" +import "core:math/rand" /* Odin is a general-purpose programming language with distinct typing built @@ -2258,6 +2259,10 @@ arbitrary_precision_mathematics :: proc() { a, b, c, d, e, f, res := &big.Int{}, &big.Int{}, &big.Int{}, &big.Int{}, &big.Int{}, &big.Int{}, &big.Int{} defer big.destroy(a, b, c, d, e, f, res) + // Set the context RNG to something that does not require + // cryptographic entropy (not supported on all targets). + context.random_generator = rand.xoshiro256_random_generator() + // How many bits should the random prime be? bits := 64 // Number of Rabin-Miller trials, -1 for automatic. diff --git a/tests/benchmark/all.odin b/tests/benchmark/all.odin index 30640ac87b5..042b2fa63d3 100644 --- a/tests/benchmark/all.odin +++ b/tests/benchmark/all.odin @@ -3,5 +3,6 @@ package benchmarks @(require) import "bytes" @(require) import "crypto" @(require) import "hash" +@(require) import "math" @(require) import "text/regex" @(require) import "strings" \ No newline at end of file diff --git a/tests/benchmark/math/benchmark_rand.odin b/tests/benchmark/math/benchmark_rand.odin new file mode 100644 index 00000000000..742c9059908 --- /dev/null +++ b/tests/benchmark/math/benchmark_rand.odin @@ -0,0 +1,130 @@ +package benchmark_core_math + +import "base:runtime" + +import "core:fmt" +import "core:math/rand" +import "core:log" +import "core:strings" +import "core:testing" +import "core:text/table" +import "core:time" + +@(private = "file") +ITERS :: 10000000 +@(private = "file") +ITERS_BULK :: 1000 + +@(private = "file") +SAMPLE_SEED : string : "ABCDEFGHIJKLMNOPQRSTUVWXYZ123456" + +@(test) +benchmark_rng :: proc(t: ^testing.T) { + runtime.DEFAULT_TEMP_ALLOCATOR_TEMP_GUARD() + + tbl: table.Table + table.init(&tbl) + defer table.destroy(&tbl) + + table.caption(&tbl, "RNG") + table.aligned_header_of_values(&tbl, .Right, "Algorithm", "Size", "Time", "Throughput") + + context.random_generator = rand.default_random_generator() + rand.reset_bytes(transmute([]byte)(SAMPLE_SEED)) + _benchmark_u64(t, &tbl, "chacha8rand") + _benchmark_large(t, &tbl, "chacha8rand") + + table.row(&tbl) + + context.random_generator = rand.pcg_random_generator() + _benchmark_u64(t, &tbl, "pcg64") + _benchmark_large(t, &tbl, "pcg64") + + table.row(&tbl) + + context.random_generator = rand.xoshiro256_random_generator() + _benchmark_u64(t, &tbl, "xorshiro256**") + _benchmark_large(t, &tbl, "xorshiro256**") + + log_table(&tbl) +} + +@(private = "file") +_benchmark_u64 :: proc(t: ^testing.T, tbl: ^table.Table, algo_name: string) { + options := &time.Benchmark_Options{ + rounds = ITERS, + bytes = 8, + setup = nil, + bench = proc(options: ^time.Benchmark_Options, allocator: runtime.Allocator) -> (err: time.Benchmark_Error){ + sum: u64 + for _ in 0 ..= options.rounds { + sum += rand.uint64() + } + options.hash = u128(sum) + options.count = options.rounds + options.processed = options.rounds * options.bytes + return + }, + teardown = nil, + } + + err := time.benchmark(options, context.allocator) + testing.expect(t, err == nil) + + time_per_iter := options.duration / ITERS + table.aligned_row_of_values( + tbl, + .Right, + algo_name, + table.format(tbl, "uint64"), + table.format(tbl, "%8M", time_per_iter), + table.format(tbl, "%5.3f MiB/s", options.megabytes_per_second), + ) +} + +@(private = "file") +_benchmark_large :: proc(t: ^testing.T, tbl: ^table.Table, algo_name: string) { + options := &time.Benchmark_Options{ + rounds = ITERS_BULK, + bytes = 1024768, + setup = nil, + bench = proc(options: ^time.Benchmark_Options, allocator: runtime.Allocator) -> (err: time.Benchmark_Error){ + n: int + for _ in 0 ..= options.rounds { + n += rand.read(options.output) + } + options.hash = u128(n) + options.count = options.rounds + options.processed = options.rounds * options.bytes + return + }, + output = make([]byte, 1024768, context.temp_allocator), + teardown = nil, + } + + err := time.benchmark(options, context.allocator) + testing.expect(t, err == nil) + + time_per_iter := options.duration / ITERS_BULK + table.aligned_row_of_values( + tbl, + .Right, + algo_name, + table.format(tbl, "1 MiB"), + table.format(tbl, "%8M", time_per_iter), + table.format(tbl, "%5.3f MiB/s", options.megabytes_per_second), + ) +} + +@(private) +log_table :: proc(tbl: ^table.Table) { + sb := strings.builder_make() + defer strings.builder_destroy(&sb) + + wr := strings.to_writer(&sb) + + fmt.sbprintln(&sb) + table.write_plain_table(wr, tbl) + + log.info(strings.to_string(sb)) +} diff --git a/tests/core/math/rand/test_core_math_rand.odin b/tests/core/math/rand/test_core_math_rand.odin index 392d3d2412d..814a1b9f8d6 100644 --- a/tests/core/math/rand/test_core_math_rand.odin +++ b/tests/core/math/rand/test_core_math_rand.odin @@ -1,19 +1,54 @@ package test_core_math_rand +import "core:math" import "core:math/rand" import "core:testing" -@test -test_default_rand_determinism :: proc(t: ^testing.T) { +Generator :: struct { + name: string, + gen: rand.Generator, + biased: bool, +} + +@(test) +test_prngs :: proc(t: ^testing.T) { + gens := []Generator { + { + "default", + rand.default_random_generator(), + false, + }, + { + "pcg64", + rand.pcg_random_generator(), // Deprecated + true, + }, + { + "xoshiro**", + rand.xoshiro256_random_generator(), + false, + }, + } + for gen in gens { + rand_determinism(t, gen) + if !gen.biased { + rand_issue_5881(t, gen) + } + } +} + +@(private = "file") +rand_determinism :: proc(t: ^testing.T, rng: Generator) { + context.random_generator = rng.gen rand.reset(13) first_value := rand.int127() rand.reset(13) second_value := rand.int127() - testing.expect(t, first_value == second_value, "Context default random number generator is non-deterministic.") + testing.expectf(t, first_value == second_value, "rng '%s' is non-deterministic.", rng.name) } -@test +@(test) test_default_rand_determinism_user_set :: proc(t: ^testing.T) { rng_state_1 := rand.create(13) rng_state_2 := rand.create(13) @@ -33,3 +68,80 @@ test_default_rand_determinism_user_set :: proc(t: ^testing.T) { testing.expect(t, first_value == second_value, "User-set default random number generator is non-deterministic.") } + +@(private = "file") +rand_issue_5881 :: proc(t:^testing.T, rng: Generator) { + // Tests issue #5881 https://github.com/odin-lang/Odin/issues/5881 + + // Bit balance and sign uniformity (modest samples to keep CI fast) + expect_u64_bit_balance(t, rng, 200_000) + expect_quaternion_sign_uniformity(t, rng, 200_000) +} + +// Helper: compute chi-square statistic for counts vs equal-expected across k bins +@(private = "file") +chi_square_equal :: proc(counts: []int) -> f64 { + n := 0 + for c in counts { + n += c + } + if n == 0 { + return 0 + } + k := len(counts) + exp := f64(n) / f64(k) + stat := f64(0) + for c in counts { + d := f64(c) - exp + stat += (d * d) / exp + } + return stat +} + +// Helper: check bit balance on u64 across many samples +@(private = "file") +expect_u64_bit_balance :: proc(t: ^testing.T, rng: Generator, samples: int, sigma_k: f64 = 6) { + rand.reset(t.seed, rng.gen) + + ones: [64]int + for i := 0; i < samples; i += 1 { + v := rand.uint64(rng.gen) + for b := 0; b < 64; b += 1 { + ones[b] += int((v >> u64(b)) & 1) + } + } + mu := f64(samples) * 0.5 + sigma := math.sqrt(f64(samples) * 0.25) + limit := sigma_k * sigma + for b := 0; b < 64; b += 1 { + diff := math.abs(f64(ones[b]) - mu) + if diff > limit { + testing.expectf(t, false, "rng '%s': u64 bit %d imbalance: ones=%d samples=%d diff=%.1f limit=%.1f", rng.name, b, ones[b], samples, diff, limit) + return + } + } +} + +// Helper: Uniformity sanity via 4D sign orthant chi-square with modest sample size. +@(private = "file") +expect_quaternion_sign_uniformity :: proc(t: ^testing.T, rng: Generator, iterations: int) { + counts: [16]int + for _ in 0..= 0 { idx |= 1 } + if y >= 0 { idx |= 2 } + if z >= 0 { idx |= 4 } + if w >= 0 { idx |= 8 } + counts[idx] += 1 + } + // df = 15. For a modest sample size, use a generous cutoff to reduce flakiness. + // Chi-square critical values (df=15): p=0.001 -> ~37.7, p=0.0001 -> ~43.8 + // We accept < 55 as a conservative stability bound across platforms. + chi := chi_square_equal(counts[:]) + testing.expectf(t, chi < 55.0, "rng '%s': 4D sign chi-square too high: %.3f (counts=%v)", rng.name, chi, counts) +} diff --git a/tests/internal/test_chacha8rand.odin b/tests/internal/test_chacha8rand.odin new file mode 100644 index 00000000000..378b398f07b --- /dev/null +++ b/tests/internal/test_chacha8rand.odin @@ -0,0 +1,151 @@ +package test_internal + +import "base:runtime" +import "core:bytes" +import "core:encoding/endian" +import "core:math/rand" +import "core:testing" + +@(private = "file") +ITERS :: 10000000 +@(private = "file") +ITERS_BULK :: 1000 + +@(private = "file") +SAMPLE_SEED : string : "ABCDEFGHIJKLMNOPQRSTUVWXYZ123456" +@(private = "file") +SAMPLE_OUTPUT := []u64{ + 0xb773b6063d4616a5, 0x1160af22a66abc3c, 0x8c2599d9418d287c, 0x7ee07e037edc5cd6, + 0xcfaa9ee02d1c16ad, 0x0e090eef8febea79, 0x3c82d271128b5b3e, 0x9c5addc11252a34f, + 0xdf79bb617d6ceea6, 0x36d553591f9d736a, 0xeef0d14e181ee01f, 0x089bfc760ae58436, + 0xd9e52b59cc2ad268, 0xeb2fb4444b1b8aba, 0x4f95c8a692c46661, 0xc3c6323217cae62c, + 0x91ebb4367f4e2e7e, 0x784cf2c6a0ec9bc6, 0x5c34ec5c34eabe20, 0x4f0a8f515570daa8, + 0xfc35dcb4113d6bf2, 0x5b0da44c645554bc, 0x6d963da3db21d9e1, 0xeeaefc3150e500f3, + 0x2d37923dda3750a5, 0x380d7a626d4bc8b0, 0xeeaf68ede3d7ee49, 0xf4356695883b717c, + 0x846a9021392495a4, 0x8e8510549630a61b, 0x18dc02545dbae493, 0x0f8f9ff0a65a3d43, + 0xccf065f7190ff080, 0xfd76d1aa39673330, 0x95d232936cba6433, 0x6c7456d1070cbd17, + 0x462acfdaff8c6562, 0x5bafab866d34fc6a, 0x0c862f78030a2988, 0xd39a83e407c3163d, + 0xc00a2b7b45f22ebf, 0x564307c62466b1a9, 0x257e0424b0c072d4, 0x6fb55e99496c28fe, + 0xae9873a88f5cd4e0, 0x4657362ac60d3773, 0x1c83f91ecdf23e8e, 0x6fdc0792c15387c0, + 0x36dad2a30dfd2b5c, 0xa4b593290595bdb7, 0x4de18934e4cc02c5, 0xcdc0d604f015e3a7, + 0xfba0dbf69ad80321, 0x60e8bea3d139de87, 0xd18a4d851ef48756, 0x6366447c2215f34a, + 0x05682e97d3d007ee, 0x4c0e8978c6d54ab2, 0xcf1e9f6a6712edc2, 0x061439414c80cfd3, + 0xd1a8b6e2745c0ead, 0x31a7918d45c410e8, 0xabcc61ad90216eec, 0x4040d92d2032a71a, + 0x3cd2f66ffb40cd68, 0xdcd051c07295857a, 0xeab55cbcd9ab527e, 0x18471dce781bdaac, + 0xf7f08cd144dc7252, 0x5804e0b13d7f40d1, 0x5cb1a446e4b2d35b, 0xe6d4a728d2138a06, + 0x05223e40ca60dad8, 0x2d61ec3206ac6a68, 0xab692356874c17b8, 0xc30954417676de1c, + 0x4f1ace3732225624, 0xfba9510813988338, 0x997f200f52752e11, 0x1116aaafe86221fa, + 0x07ce3b5cb2a13519, 0x2956bc72bc458314, 0x4188b7926140eb78, 0x56ca6dbfd4adea4d, + 0x7fe3c22349340ce5, 0x35c08f9c37675f8a, 0x11e1c7fbef5ed521, 0x98adc8464ec1bc75, + 0xd163b2c73d1203f8, 0x8c761ee043a2f3f3, 0x24b99d6accecd7b7, 0x793e31aa112f0370, + 0x8e87dc2a19285139, 0x4247ae04f7096e25, 0x514f3122926fe20f, 0xdc6fb3f045d2a7e9, + 0x15cb30cecdd18eba, 0xcbc7fdecf6900274, 0x3fb5c696dc8ba021, 0xd1664417c8d274e6, + 0x05f7e445ea457278, 0xf920bbca1b9db657, 0x0c1950b4da22cb99, 0xf875baf1af09e292, + 0xbed3d7b84250f838, 0xf198e8080fd74160, 0xc9eda51d9b7ea703, 0xf709ef55439bf8f6, + 0xd20c74feebf116fc, 0x305668eb146d7546, 0x829af3ec10d89787, 0x15b8f9697b551dbc, + 0xfc823c6c8e64b8c9, 0x345585e8183b40bc, 0x674b4171d6581368, 0x1234d81cd670e9f7, + 0x0e505210d8a55e19, 0xe8258d69eeeca0dc, 0x05d4c452e8baf67e, 0xe8dbe30116a45599, + 0x1cf08ce1b1176f00, 0xccf7d0a4b81ecb49, 0x303fea136b2c430e, 0x861d6c139c06c871, + 0x5f41df72e05e0487, 0x25bd7e1e1ae26b1d, 0xbe9f4004d662a41d, 0x65bf58d483188546, + 0xd1b27cff69db13cc, 0x01a6663372c1bb36, 0x578dd7577b727f4d, 0x19c78f066c083cf6, + 0xdbe014d4f9c391bb, 0x97fbb2dd1d13ffb3, 0x31c91e0af9ef8d4f, 0x094dfc98402a43ba, + 0x069bd61bea37b752, 0x5b72d762e8d986ca, 0x72ee31865904bc85, 0xd1f5fdc5cd36c33e, + 0xba9b4980a8947cad, 0xece8f05eac49ab43, 0x65fe1184abae38e7, 0x2d7cb9dea5d31452, + 0xcc71489476e467e3, 0x4c03a258a578c68c, 0x00efdf9ecb0fd8fc, 0x9924cad471e2666d, + 0x87f8668318f765e9, 0xcb4dc57c1b55f5d8, 0xd373835a86604859, 0xe526568b5540e482, + 0x1f39040f08586fec, 0xb764f3f00293f8e6, 0x049443a2f6bd50a8, 0x76fec88697d3941a, + 0x3efb70d039bae7a2, 0xe2f4611368eca8a8, 0x7c007a96e01d2425, 0xbbcce5768e69c5bf, + 0x784fb4985c42aac3, 0xf72b5091aa223874, 0x3630333fb1e62e07, 0x8e7319ebdebbb8de, + 0x2a3982bca959fa00, 0xb2b98b9f964ba9b3, 0xf7e31014adb71951, 0xebd0fca3703acc82, + 0xec654e2a2fe6419a, 0xb326132d55a52e2c, 0x2248c57f44502978, 0x32710c2f342daf16, + 0x0517b47b5acb2bec, 0x4c7a718fca270937, 0xd69142bed0bcc541, 0xe40ebcb8ff52ce88, + 0x3e44a2dbc9f828d4, 0xc74c2f4f8f873f58, 0x3dbf648eb799e45b, 0x33f22475ee0e86f8, + 0x1eb4f9ee16d47f65, 0x40f8d2b8712744e3, 0xb886b4da3cb14572, 0x2086326fbdd6f64d, + 0xcc3de5907dd882b9, 0xa2e8b49a5ee909df, 0xdbfb8e7823964c10, 0x70dd6089ef0df8d5, + 0x30141663cdd9c99f, 0x04b805325c240365, 0x7483d80314ac12d6, 0x2b271cb91aa7f5f9, + 0x97e2245362abddf0, 0x5a84f614232a9fab, 0xf71125fcda4b7fa2, 0x1ca5a61d74b27267, + 0x38cc6a9b3adbcb45, 0xdde1bb85dc653e39, 0xe9d0c8fa64f89fd4, 0x02c5fb1ecd2b4188, + 0xf2bd137bca5756e5, 0xadefe25d121be155, 0x56cd1c3c5d893a8e, 0x4c50d337beb65bb9, + 0x918c5151675cf567, 0xaba649ffcfb56a1e, 0x20c74ab26a2247cd, 0x71166bac853c08da, + 0xb07befe2e584fc5d, 0xda45ff2a588dbf32, 0xdb98b03c4d75095e, 0x60285ae1aaa65a4c, + 0xf93b686a263140b8, 0xde469752ee1c180e, 0xcec232dc04129aae, 0xeb916baa1835ea04, + 0xd49c21c8b64388ff, 0x72a82d9658864888, 0x003348ef7eac66a8, 0x7f6f67e655b209eb, + 0x532ffb0b7a941b25, 0xd940ade6128deede, 0xdf24f2a1af89fe23, 0x95aa3b4988195ae0, + 0x3da649404f94be4a, 0x692dad132c3f7e27, 0x40aee76ecaaa9eb8, 0x1294a01e09655024, + 0x6df797abdba4e4f5, 0xea2fb6024c1d7032, 0x5f4e0492295489fc, 0x57972914ea22e06a, + 0x9a8137d133aad473, 0xa2e6dd6ae7cdf2f3, 0x9f42644f18086647, 0x16d03301c170bd3e, + 0x908c416fa546656d, 0xe081503be22e123e, 0x077cf09116c4cc72, 0xcbd25cd264b7f229, + 0x3db2f468ec594031, 0x46c00e734c9badd5, 0xd0ec0ac72075d861, 0x3037cb3cf80b7630, + 0x574c3d7b3a2721c6, 0xae99906a0076824b, 0xb175a5418b532e70, 0xd8b3e251ee231ddd, + 0xb433eec25dca1966, 0x530f30dc5cff9a93, 0x9ff03d98b53cd335, 0xafc4225076558cdf, + 0xef81d3a28284402a, 0x110bdbf51c110a28, 0x9ae1b255d027e8f6, 0x7de3e0aa24688332, + 0xe483c3ecd2067ee2, 0xf829328b276137e6, 0xa413ccad57562cad, 0xe6118e8b496acb1f, + 0x8288dca6da5ec01f, 0xa53777dc88c17255, 0x8a00f1e0d5716eda, 0x618e6f47b7a720a8, + 0x9e3907b0c692a841, 0x978b42ca963f34f3, 0x75e4b0cd98a7d7ef, 0xde4dbd6e0b5f4752, + 0x0252e4153f34493f, 0x50f0e7d803734ef9, 0x237766a38ed167ee, 0x4124414001ee39a0, + 0xd08df643e535bb21, 0x34f575b5a9a80b74, 0x2c343af87297f755, 0xcd8b6d99d821f7cb, + 0xe376fd7256fc48ae, 0xe1b06e7334352885, 0xfa87b26f86c169eb, 0x36c1604665a971de, + 0xdba147c2239c8e80, 0x6b208e69fc7f0e24, 0x8795395b6f2b60c3, 0x05dabee9194907f4, + 0xb98175142f5ed902, 0x5e1701e2021ddc81, 0x0875aba2755eed08, 0x778d83289251de95, + 0x3bfbe46a039ecb31, 0xb24704fce4cbd7f9, 0x6985ffe9a7c91e3d, 0xc8efb13df249dabb, + 0xb1037e64b0f4c9f6, 0x55f69fd197d6b7c3, 0x672589d71d68a90c, 0xbebdb8224f50a77e, + 0x3f589f80007374a7, 0xd307f4635954182a, 0xcff5850c10d4fd90, 0xc6da02dfb6408e15, + 0x93daeef1e2b1a485, 0x65d833208aeea625, 0xe2b13fa13ed3b5fa, 0x67053538130fb68e, + 0xc1042f6598218fa9, 0xee5badca749b8a2e, 0x6d22a3f947dae37d, 0xb62c6d1657f4dbaf, + 0x6e007de69704c20b, 0x1af2b913fc3841d8, 0xdc0e47348e2e8e22, 0x9b1ddef1cf958b22, + 0x632ed6b0233066b8, 0xddd02d3311bed8f2, 0xf147cfe1834656e9, 0x399aaa49d511597a, + 0x6b14886979ec0309, 0x64fc4ac36b5afb97, 0xb82f78e07f7cf081, 0x10925c9a323d0e1b, + 0xf451c79ee13c63f6, 0x7c2fc180317876c7, 0x35a12bd9eecb7d22, 0x335654a539621f90, + 0xcc32a3f35db581f0, 0xc60748a80b2369cb, 0x7c4dd3b08591156b, 0xac1ced4b6de22291, + 0xa32cfa2df134def5, 0x627108918dea2a53, 0x0555b1608fcb4ff4, 0x143ee7ac43aaa33c, + 0xdae90ce7cf4fc218, 0x4d68fc2582bcf4b5, 0x37094e1849135d71, 0xf7857e09f3d49fd8, + 0x007538c503768be7, 0xedf648ba2f6be601, 0xaa347664dd72513e, 0xbe63893c6ef23b86, + 0x130b85710605af97, 0xdd765c6b1ef6ab56, 0xf3249a629a97dc6b, 0x2a114f9020fab8e5, + 0x5a69e027cfc6ad08, 0x3c4ccb36f1a5e050, 0x2e9e7d596834f0a5, 0x2430be6858fce789, + 0xe90b862f2466e597, 0x895e2884f159a9ec, 0x26ab8fa4902fcb57, 0xa6efff5c54e1fa50, + 0x333ac4e5811a8255, 0xa58d515f02498611, 0xfe5a09dcb25c6ef4, 0x03898988ab5f5818, + 0x289ff6242af6c617, 0x3d9dd59fd381ea23, 0x52d7d93d8a8aae51, 0xc76a123d511f786f, + 0xf68901edaf00c46c, 0x8c630871b590de80, 0x05209c308991e091, 0x1f809f99b4788177, + 0x11170c2eb6c19fd8, 0x44433c779062ba58, 0xc0acb51af1874c45, 0x9f2e134284809fa1, + 0xedb523bd15c619fa, 0x02d97fd53ecc23c0, 0xacaf05a34462374c, 0xddd9c6d34bffa11f, +} + +@(test) +chacha8rand_u64s :: proc(t: ^testing.T) { + st: runtime.Default_Random_State + context.random_generator = runtime.default_random_generator(&st) + rand.reset_bytes(transmute([]byte)(SAMPLE_SEED)) + + for expected, i in SAMPLE_OUTPUT { + actual := rand.uint64() + testing.expectf(t, expected == actual, "[%d]: got %x (expected %x)", i, actual, expected) + } +} + +@(test) +chacha8rand_bytes :: proc(t: ^testing.T) { + st: runtime.Default_Random_State + context.random_generator = runtime.default_random_generator(&st) + rand.reset_bytes(transmute([]byte)(SAMPLE_SEED)) + + // Test a massive bulk read. + buf := make([]byte, len(SAMPLE_OUTPUT) * size_of(u64), context.temp_allocator) + n := rand.read(buf) + testing.expectf(t, n == len(buf), "insufficient output: got %d (expected %d)", n, len(buf)) + + for expected, i in SAMPLE_OUTPUT { + actual, _ := endian.get_u64(buf[i*8:], .Little) + testing.expectf(t, expected == actual, "[%d]: got %x (expected %x)", i, actual, expected) + } + + // Test that the internal state always advances by a multiple of + // 8-bytes. + rand.reset_bytes(transmute([]byte)(SAMPLE_SEED)) + tmp: [8]byte + off: int + for i := 1; i < 8; i += 1 { + _ = rand.read(tmp[:i]) + testing.expect(t, bytes.equal(tmp[:i], buf[off:off+i])) + off += 8 + } +}