From fe238e49a0b00bb3c4423a30f381154549278fd7 Mon Sep 17 00:00:00 2001 From: Jack O'Connor Date: Sun, 21 Jan 2024 21:15:45 -0800 Subject: [PATCH] factor out the `blake3` crate changes from the guts_api branch This commit and the branch that it starts are unlikely to land as-is, but I want to maintain them while I flesh out the new `blake3_guts` sub-crate. --- Cargo.toml | 2 +- b3sum/Cargo.lock | 12 +- b3sum/src/main.rs | 2 +- benches/bench.rs | 175 +------------ src/lib.rs | 551 ++++++++++++---------------------------- src/portable.rs | 8 +- src/test.rs | 407 ++++++++++++----------------- test_vectors/Cargo.toml | 1 + test_vectors/src/lib.rs | 2 +- 9 files changed, 337 insertions(+), 823 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 591a6e32a..fe7343649 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -92,8 +92,8 @@ no_neon = [] features = ["mmap", "rayon", "serde", "zeroize"] [dependencies] -arrayref = "0.3.5" arrayvec = { version = "0.7.4", default-features = false } +blake3_guts = { path = "rust/guts" } constant_time_eq = "0.3.0" cfg-if = "1.0.0" digest = { version = "0.10.1", features = [ "mac" ], optional = true } diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock index 10caffb4a..7bab0b05c 100644 --- a/b3sum/Cargo.lock +++ b/b3sum/Cargo.lock @@ -56,12 +56,6 @@ version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" -[[package]] -name = "arrayref" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" - [[package]] name = "arrayvec" version = "0.7.4" @@ -98,8 +92,8 @@ checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" name = "blake3" version = "1.5.0" dependencies = [ - "arrayref", "arrayvec", + "blake3_guts", "cc", "cfg-if", "constant_time_eq", @@ -107,6 +101,10 @@ dependencies = [ "rayon", ] +[[package]] +name = "blake3_guts" +version = "0.0.0" + [[package]] name = "cc" version = "1.0.83" diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs index 228737ff0..3dadc002e 100644 --- a/b3sum/src/main.rs +++ b/b3sum/src/main.rs @@ -186,7 +186,7 @@ fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()> // TODO: This computes each output block twice when the --seek argument isn't a multiple of 64. // We'll refactor all of this soon anyway, once SIMD optimizations are available for the XOF. let mut len = args.len(); - let mut block = [0; blake3::guts::BLOCK_LEN]; + let mut block = [0; 64]; while len > 0 { output.fill(&mut block); let hex_str = hex::encode(&block[..]); diff --git a/benches/bench.rs b/benches/bench.rs index 5efb9e6f5..e057d24e1 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -2,11 +2,7 @@ extern crate test; -use arrayref::array_ref; -use arrayvec::ArrayVec; -use blake3::guts::{BLOCK_LEN, CHUNK_LEN}; -use blake3::platform::{Platform, MAX_SIMD_DEGREE}; -use blake3::OUT_LEN; +use blake3_guts::BLOCK_LEN; use rand::prelude::*; use test::Bencher; @@ -49,175 +45,6 @@ impl RandomInput { } } -fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) { - let mut state = [1u32; 8]; - let mut r = RandomInput::new(b, 64); - let input = array_ref!(r.get(), 0, 64); - b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0)); -} - -#[bench] -fn bench_single_compression_portable(b: &mut Bencher) { - bench_single_compression_fn(b, Platform::portable()); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_single_compression_sse2(b: &mut Bencher) { - if let Some(platform) = Platform::sse2() { - bench_single_compression_fn(b, platform); - } -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_single_compression_sse41(b: &mut Bencher) { - if let Some(platform) = Platform::sse41() { - bench_single_compression_fn(b, platform); - } -} - -#[bench] -#[cfg(blake3_avx512_ffi)] -fn bench_single_compression_avx512(b: &mut Bencher) { - if let Some(platform) = Platform::avx512() { - bench_single_compression_fn(b, platform); - } -} - -fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) { - let degree = platform.simd_degree(); - let mut inputs = Vec::new(); - for _ in 0..degree { - inputs.push(RandomInput::new(b, CHUNK_LEN)); - } - b.iter(|| { - let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - platform.hash_many( - &input_arrays[..], - &[0; 8], - 0, - blake3::IncrementCounter::Yes, - 0, - 0, - 0, - &mut out, - ); - }); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_chunks_sse2(b: &mut Bencher) { - if let Some(platform) = Platform::sse2() { - bench_many_chunks_fn(b, platform); - } -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_chunks_sse41(b: &mut Bencher) { - if let Some(platform) = Platform::sse41() { - bench_many_chunks_fn(b, platform); - } -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_chunks_avx2(b: &mut Bencher) { - if let Some(platform) = Platform::avx2() { - bench_many_chunks_fn(b, platform); - } -} - -#[bench] -#[cfg(blake3_avx512_ffi)] -fn bench_many_chunks_avx512(b: &mut Bencher) { - if let Some(platform) = Platform::avx512() { - bench_many_chunks_fn(b, platform); - } -} - -#[bench] -#[cfg(feature = "neon")] -fn bench_many_chunks_neon(b: &mut Bencher) { - if let Some(platform) = Platform::neon() { - bench_many_chunks_fn(b, platform); - } -} - -// TODO: When we get const generics we can unify this with the chunks code. -fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) { - let degree = platform.simd_degree(); - let mut inputs = Vec::new(); - for _ in 0..degree { - inputs.push(RandomInput::new(b, BLOCK_LEN)); - } - b.iter(|| { - let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs - .iter_mut() - .take(degree) - .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) - .collect(); - let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; - platform.hash_many( - &input_arrays[..], - &[0; 8], - 0, - blake3::IncrementCounter::No, - 0, - 0, - 0, - &mut out, - ); - }); -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_parents_sse2(b: &mut Bencher) { - if let Some(platform) = Platform::sse2() { - bench_many_parents_fn(b, platform); - } -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_parents_sse41(b: &mut Bencher) { - if let Some(platform) = Platform::sse41() { - bench_many_parents_fn(b, platform); - } -} - -#[bench] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn bench_many_parents_avx2(b: &mut Bencher) { - if let Some(platform) = Platform::avx2() { - bench_many_parents_fn(b, platform); - } -} - -#[bench] -#[cfg(blake3_avx512_ffi)] -fn bench_many_parents_avx512(b: &mut Bencher) { - if let Some(platform) = Platform::avx512() { - bench_many_parents_fn(b, platform); - } -} - -#[bench] -#[cfg(feature = "neon")] -fn bench_many_parents_neon(b: &mut Bencher) { - if let Some(platform) = Platform::neon() { - bench_many_parents_fn(b, platform); - } -} - fn bench_atonce(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| blake3::hash(input.get())); diff --git a/src/lib.rs b/src/lib.rs index 1fe47bf54..efc1fc4ee 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,109 +88,29 @@ #[cfg(test)] mod test; -// The guts module is for incremental use cases like the `bao` crate that need -// to explicitly compute chunk and parent chaining values. It is semi-stable -// and likely to keep working, but largely undocumented and not intended for -// widespread use. -#[doc(hidden)] -pub mod guts; - -/// Undocumented and unstable, for benchmarks only. -#[doc(hidden)] -pub mod platform; - -// Platform-specific implementations of the compression function. These -// BLAKE3-specific cfg flags are set in build.rs. -#[cfg(blake3_avx2_rust)] -#[path = "rust_avx2.rs"] -mod avx2; -#[cfg(blake3_avx2_ffi)] -#[path = "ffi_avx2.rs"] -mod avx2; -#[cfg(blake3_avx512_ffi)] -#[path = "ffi_avx512.rs"] -mod avx512; -#[cfg(blake3_neon)] -#[path = "ffi_neon.rs"] -mod neon; -mod portable; -#[cfg(blake3_sse2_rust)] -#[path = "rust_sse2.rs"] -mod sse2; -#[cfg(blake3_sse2_ffi)] -#[path = "ffi_sse2.rs"] -mod sse2; -#[cfg(blake3_sse41_rust)] -#[path = "rust_sse41.rs"] -mod sse41; -#[cfg(blake3_sse41_ffi)] -#[path = "ffi_sse41.rs"] -mod sse41; - #[cfg(feature = "traits-preview")] pub mod traits; mod io; mod join; -use arrayref::{array_mut_ref, array_ref}; use arrayvec::{ArrayString, ArrayVec}; use core::cmp; use core::fmt; -use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; -/// The number of bytes in a [`Hash`](struct.Hash.html), 32. +use blake3_guts as guts; +use guts::{ + BlockBytes, CVBytes, BLOCK_LEN, CHUNK_END, CHUNK_LEN, CHUNK_START, DERIVE_KEY_CONTEXT, + DERIVE_KEY_MATERIAL, IV_BYTES, KEYED_HASH, PARENT, ROOT, +}; + +/// The number of bytes in a [`Hash`](struct.Hash.html), 32 pub const OUT_LEN: usize = 32; -/// The number of bytes in a key, 32. +/// The number of bytes in a key, 32 pub const KEY_LEN: usize = 32; const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64 -use guts::{BLOCK_LEN, CHUNK_LEN}; - -// While iterating the compression function within a chunk, the CV is -// represented as words, to avoid doing two extra endianness conversions for -// each compression in the portable implementation. But the hash_many interface -// needs to hash both input bytes and parent nodes, so its better for its -// output CVs to be represented as bytes. -type CVWords = [u32; 8]; -type CVBytes = [u8; 32]; // little-endian - -const IV: &CVWords = &[ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, -]; - -const MSG_SCHEDULE: [[usize; 16]; 7] = [ - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], - [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], - [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], - [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], - [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], - [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], -]; - -// These are the internal flags that we use to domain separate root/non-root, -// chunk/parent, and chunk beginning/middle/end. These get set at the high end -// of the block flags word in the compression function, so their values start -// high and go down. -const CHUNK_START: u8 = 1 << 0; -const CHUNK_END: u8 = 1 << 1; -const PARENT: u8 = 1 << 2; -const ROOT: u8 = 1 << 3; -const KEYED_HASH: u8 = 1 << 4; -const DERIVE_KEY_CONTEXT: u8 = 1 << 5; -const DERIVE_KEY_MATERIAL: u8 = 1 << 6; - -#[inline] -fn counter_low(counter: u64) -> u32 { - counter as u32 -} - -#[inline] -fn counter_high(counter: u64) -> u32 { - (counter >> 32) as u32 -} /// An output of the default size, 32 bytes, which provides constant-time /// equality checking. @@ -219,19 +139,19 @@ fn counter_high(counter: u64) -> u32 { #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] #[derive(Clone, Copy, Hash)] -pub struct Hash([u8; OUT_LEN]); +pub struct Hash(CVBytes); impl Hash { /// The raw bytes of the `Hash`. Note that byte arrays don't provide /// constant-time equality checking, so if you need to compare hashes, /// prefer the `Hash` type. #[inline] - pub const fn as_bytes(&self) -> &[u8; OUT_LEN] { + pub const fn as_bytes(&self) -> &CVBytes { &self.0 } /// Create a `Hash` from its raw bytes representation. - pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self { + pub const fn from_bytes(bytes: CVBytes) -> Self { Self(bytes) } @@ -275,7 +195,7 @@ impl Hash { if hex_bytes.len() != OUT_LEN * 2 { return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len()))); } - let mut hash_bytes: [u8; OUT_LEN] = [0; OUT_LEN]; + let mut hash_bytes: CVBytes = [0; OUT_LEN]; for i in 0..OUT_LEN { hash_bytes[i] = 16 * hex_val(hex_bytes[2 * i])? + hex_val(hex_bytes[2 * i + 1])?; } @@ -283,14 +203,14 @@ impl Hash { } } -impl From<[u8; OUT_LEN]> for Hash { +impl From for Hash { #[inline] - fn from(bytes: [u8; OUT_LEN]) -> Self { + fn from(bytes: CVBytes) -> Self { Self::from_bytes(bytes) } } -impl From for [u8; OUT_LEN] { +impl From for CVBytes { #[inline] fn from(hash: Hash) -> Self { hash.0 @@ -314,9 +234,9 @@ impl PartialEq for Hash { } /// This implementation is constant-time. -impl PartialEq<[u8; OUT_LEN]> for Hash { +impl PartialEq for Hash { #[inline] - fn eq(&self, other: &[u8; OUT_LEN]) -> bool { + fn eq(&self, other: &CVBytes) -> bool { constant_time_eq::constant_time_eq_32(&self.0, other) } } @@ -395,70 +315,56 @@ impl std::error::Error for HexError {} #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] #[derive(Clone)] struct Output { - input_chaining_value: CVWords, - block: [u8; 64], + input_chaining_value: CVBytes, + block: BlockBytes, block_len: u8, counter: u64, flags: u8, - #[cfg_attr(feature = "zeroize", zeroize(skip))] - platform: Platform, } impl Output { fn chaining_value(&self) -> CVBytes { - let mut cv = self.input_chaining_value; - self.platform.compress_in_place( - &mut cv, + guts::DETECTED_IMPL.compress( &self.block, - self.block_len, + self.block_len as u32, + &self.input_chaining_value, self.counter, - self.flags, - ); - platform::le_bytes_from_words_32(&cv) + self.flags as u32, + ) } fn root_hash(&self) -> Hash { debug_assert_eq!(self.counter, 0); - let mut cv = self.input_chaining_value; - self.platform - .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); - Hash(platform::le_bytes_from_words_32(&cv)) - } - - fn root_output_block(&self) -> [u8; 2 * OUT_LEN] { - self.platform.compress_xof( - &self.input_chaining_value, + Hash(guts::DETECTED_IMPL.compress( &self.block, - self.block_len, - self.counter, - self.flags | ROOT, - ) + self.block_len as u32, + &self.input_chaining_value, + 0, + self.flags as u32 | ROOT, + )) } } #[derive(Clone)] #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] struct ChunkState { - cv: CVWords, + cv: CVBytes, chunk_counter: u64, - buf: [u8; BLOCK_LEN], + buf: BlockBytes, buf_len: u8, blocks_compressed: u8, flags: u8, - #[cfg_attr(feature = "zeroize", zeroize(skip))] - platform: Platform, } impl ChunkState { - fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self { + fn new(key: &CVBytes, chunk_counter: u64, flags: u32) -> Self { Self { cv: *key, chunk_counter, buf: [0; BLOCK_LEN], buf_len: 0, blocks_compressed: 0, - flags, - platform, + flags: flags as u8, } } @@ -474,7 +380,7 @@ impl ChunkState { *input = &input[take..]; } - fn start_flag(&self) -> u8 { + fn start_flag(&self) -> u32 { if self.blocks_compressed == 0 { CHUNK_START } else { @@ -489,13 +395,12 @@ impl ChunkState { self.fill_buf(&mut input); if !input.is_empty() { debug_assert_eq!(self.buf_len as usize, BLOCK_LEN); - let block_flags = self.flags | self.start_flag(); // borrowck - self.platform.compress_in_place( - &mut self.cv, + self.cv = guts::DETECTED_IMPL.compress( &self.buf, - BLOCK_LEN as u8, + BLOCK_LEN as u32, + &self.cv, self.chunk_counter, - block_flags, + self.flags as u32 | self.start_flag(), ); self.buf_len = 0; self.buf = [0; BLOCK_LEN]; @@ -505,13 +410,12 @@ impl ChunkState { while input.len() > BLOCK_LEN { debug_assert_eq!(self.buf_len, 0); - let block_flags = self.flags | self.start_flag(); // borrowck - self.platform.compress_in_place( - &mut self.cv, - array_ref!(input, 0, BLOCK_LEN), - BLOCK_LEN as u8, + self.cv = guts::DETECTED_IMPL.compress( + input[..BLOCK_LEN].try_into().unwrap(), + BLOCK_LEN as u32, + &self.cv, self.chunk_counter, - block_flags, + self.flags as u32 | self.start_flag(), ); self.blocks_compressed += 1; input = &input[BLOCK_LEN..]; @@ -524,14 +428,12 @@ impl ChunkState { } fn output(&self) -> Output { - let block_flags = self.flags | self.start_flag() | CHUNK_END; Output { input_chaining_value: self.cv, block: self.buf, block_len: self.buf_len, counter: self.chunk_counter, - flags: block_flags, - platform: self.platform, + flags: self.flags | self.start_flag() as u8 | CHUNK_END as u8, } } } @@ -543,7 +445,6 @@ impl fmt::Debug for ChunkState { .field("len", &self.len()) .field("chunk_counter", &self.chunk_counter) .field("flags", &self.flags) - .field("platform", &self.platform) .finish() } } @@ -563,131 +464,6 @@ impl fmt::Debug for ChunkState { // use full-width SIMD vectors for parent hashing. Without parallel parent // hashing, we lose about 10% of overall throughput on AVX2 and AVX-512. -/// Undocumented and unstable, for benchmarks only. -#[doc(hidden)] -#[derive(Clone, Copy)] -pub enum IncrementCounter { - Yes, - No, -} - -impl IncrementCounter { - #[inline] - fn yes(&self) -> bool { - match self { - IncrementCounter::Yes => true, - IncrementCounter::No => false, - } - } -} - -// The largest power of two less than or equal to `n`, used for left_len() -// immediately below, and also directly in Hasher::update(). -fn largest_power_of_two_leq(n: usize) -> usize { - ((n / 2) + 1).next_power_of_two() -} - -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -fn left_len(content_len: usize) -> usize { - debug_assert!(content_len > CHUNK_LEN); - // Subtract 1 to reserve at least one byte for the right side. - let full_chunks = (content_len - 1) / CHUNK_LEN; - largest_power_of_two_leq(full_chunks) * CHUNK_LEN -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time -// on a single thread. Write out the chunk chaining values and return the -// number of chunks hashed. These chunks are never the root and never empty; -// those cases use a different codepath. -fn compress_chunks_parallel( - input: &[u8], - key: &CVWords, - chunk_counter: u64, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - debug_assert!(!input.is_empty(), "empty chunks below the root"); - debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN); - - let mut chunks_exact = input.chunks_exact(CHUNK_LEN); - let mut chunks_array = ArrayVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new(); - for chunk in &mut chunks_exact { - chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN)); - } - platform.hash_many( - &chunks_array, - key, - chunk_counter, - IncrementCounter::Yes, - flags, - CHUNK_START, - CHUNK_END, - out, - ); - - // Hash the remaining partial chunk, if there is one. Note that the empty - // chunk (meaning the empty message) is a different codepath. - let chunks_so_far = chunks_array.len(); - if !chunks_exact.remainder().is_empty() { - let counter = chunk_counter + chunks_so_far as u64; - let mut chunk_state = ChunkState::new(key, counter, flags, platform); - chunk_state.update(chunks_exact.remainder()); - *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) = - chunk_state.output().chaining_value(); - chunks_so_far + 1 - } else { - chunks_so_far - } -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time -// on a single thread. Write out the parent chaining values and return the -// number of parents hashed. (If there's an odd input chaining value left over, -// return it as an additional output.) These parents are never the root and -// never empty; those cases use a different codepath. -fn compress_parents_parallel( - child_chaining_values: &[u8], - key: &CVWords, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes"); - let num_children = child_chaining_values.len() / OUT_LEN; - debug_assert!(num_children >= 2, "not enough children"); - debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many"); - - let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN); - // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of - // the requirements of compress_subtree_wide(). - let mut parents_array = ArrayVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new(); - for parent in &mut parents_exact { - parents_array.push(array_ref!(parent, 0, BLOCK_LEN)); - } - platform.hash_many( - &parents_array, - key, - 0, // Parents always use counter 0. - IncrementCounter::No, - flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out, - ); - - // If there's an odd child left over, it becomes an output. - let parents_so_far = parents_array.len(); - if !parents_exact.remainder().is_empty() { - out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder()); - parents_so_far + 1 - } else { - parents_so_far - } -} - // The wide helper function returns (writes out) an array of chaining values // and returns the length of that array. The number of chaining values returned // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, @@ -707,66 +483,41 @@ fn compress_parents_parallel( // multithreading parallelism for that update(). fn compress_subtree_wide( input: &[u8], - key: &CVWords, + key: &CVBytes, chunk_counter: u64, - flags: u8, - platform: Platform, - out: &mut [u8], + flags: u32, + out: guts::TransposedSplit, ) -> usize { // Note that the single chunk case does *not* bump the SIMD degree up to 2 // when it is 1. This allows Rayon the option of multithreading even the // 2-chunk case, which can help performance on smaller platforms. - if input.len() <= platform.simd_degree() * CHUNK_LEN { - return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); + let degree = guts::DETECTED_IMPL.degree(); + if input.len() <= degree * CHUNK_LEN { + return guts::DETECTED_IMPL.hash_chunks(input, key, chunk_counter, flags, out); } // With more than simd_degree chunks, we need to recurse. Start by dividing // the input into left and right subtrees. (Note that this is only optimal // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree // of 3 or something, we'll need a more complicated strategy.) - debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); - let (left, right) = input.split_at(left_len(input.len())); + debug_assert_eq!(degree.count_ones(), 1, "power of 2"); + let (left, right) = input.split_at(guts::left_len(input.len())); let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; - // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to - // account for the special case of returning 2 outputs when the SIMD degree - // is 1. - let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; - let degree = if left.len() == CHUNK_LEN { - // The "simd_degree=1 and we're at the leaf nodes" case. - debug_assert_eq!(platform.simd_degree(), 1); - 1 - } else { - cmp::max(platform.simd_degree(), 2) - }; - let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN); + let mut transposed_cvs = guts::TransposedVectors::new(); + let (left_cvs, right_cvs) = guts::DETECTED_IMPL.split_transposed_vectors(&mut transposed_cvs); // Recurse! For update_rayon(), this is where we take advantage of RayonJoin and use multiple // threads. let (left_n, right_n) = J::join( - || compress_subtree_wide::(left, key, chunk_counter, flags, platform, left_out), - || compress_subtree_wide::(right, key, right_chunk_counter, flags, platform, right_out), + || compress_subtree_wide::(left, key, chunk_counter, flags, left_cvs), + || compress_subtree_wide::(right, key, right_chunk_counter, flags, right_cvs), ); - // The special case again. If simd_degree=1, then we'll have left_n=1 and - // right_n=1. Rather than compressing them into a single output, return - // them directly, to make sure we always have at least two outputs. - debug_assert_eq!(left_n, degree); - debug_assert!(right_n >= 1 && right_n <= left_n); - if left_n == 1 { - out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]); - return 2; - } - - // Otherwise, do one layer of parent node compression. - let num_children = left_n + right_n; - compress_parents_parallel( - &cv_array[..num_children * OUT_LEN], - key, - flags, - platform, - out, - ) + // Do one layer of parent node compression. The SIMD degree is always at least 2, so we're + // guaranteed that this isn't the root compression. + let num_cvs = left_n + right_n; + guts::DETECTED_IMPL.hash_parents(&mut transposed_cvs, num_cvs, key, flags, out) } // Hash a subtree with compress_subtree_wide(), and then condense the resulting @@ -781,50 +532,41 @@ fn compress_subtree_wide( // chunk or less. That's a different codepath. fn compress_subtree_to_parent_node( input: &[u8], - key: &CVWords, + key: &CVBytes, chunk_counter: u64, - flags: u8, - platform: Platform, -) -> [u8; BLOCK_LEN] { + flags: u32, +) -> BlockBytes { debug_assert!(input.len() > CHUNK_LEN); - let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; - let mut num_cvs = - compress_subtree_wide::(input, &key, chunk_counter, flags, platform, &mut cv_array); + let mut transposed_cvs = guts::TransposedVectors::new(); + let (left_cvs, _) = guts::DETECTED_IMPL.split_transposed_vectors(&mut transposed_cvs); + let mut num_cvs = compress_subtree_wide::(input, &key, chunk_counter, flags, left_cvs); debug_assert!(num_cvs >= 2); // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. - let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2]; while num_cvs > 2 { - let cv_slice = &cv_array[..num_cvs * OUT_LEN]; - num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array); - cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]); + num_cvs = guts::DETECTED_IMPL.reduce_parents(&mut transposed_cvs, num_cvs, key, flags); } - *array_ref!(cv_array, 0, 2 * OUT_LEN) + transposed_cvs.extract_parent_node(0) } // Hash a complete input all at once. Unlike compress_subtree_wide() and // compress_subtree_to_parent_node(), this function handles the 1 chunk case. -fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { - let platform = Platform::detect(); - +fn hash_all_at_once(input: &[u8], key: &CVBytes, flags: u32) -> Output { // If the whole subtree is one chunk, hash it directly with a ChunkState. if input.len() <= CHUNK_LEN { - return ChunkState::new(key, 0, flags, platform) - .update(input) - .output(); + return ChunkState::new(key, 0, flags).update(input).output(); } // Otherwise construct an Output object from the parent node returned by // compress_subtree_to_parent_node(). Output { input_chaining_value: *key, - block: compress_subtree_to_parent_node::(input, key, 0, flags, platform), + block: compress_subtree_to_parent_node::(input, key, 0, flags), block_len: BLOCK_LEN as u8, counter: 0, - flags: flags | PARENT, - platform, + flags: flags as u8 | PARENT as u8, } } @@ -839,7 +581,7 @@ fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Ou /// This function is always single-threaded. For multithreading support, see /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). pub fn hash(input: &[u8]) -> Hash { - hash_all_at_once::(input, IV, 0).root_hash() + hash_all_at_once::(input, &IV_BYTES, 0).root_hash() } /// The keyed hash function. @@ -856,9 +598,8 @@ pub fn hash(input: &[u8]) -> Hash { /// This function is always single-threaded. For multithreading support, see /// [`Hasher::new_keyed`] and /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). -pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { - let key_words = platform::words_from_le_bytes_32(key); - hash_all_at_once::(input, &key_words, KEYED_HASH).root_hash() +pub fn keyed_hash(key: &CVBytes, input: &[u8]) -> Hash { + hash_all_at_once::(input, key, KEYED_HASH).root_hash() } /// The key derivation function. @@ -896,12 +637,11 @@ pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). /// /// [Argon2]: https://en.wikipedia.org/wiki/Argon2 -pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] { +pub fn derive_key(context: &str, key_material: &[u8]) -> CVBytes { let context_key = - hash_all_at_once::(context.as_bytes(), IV, DERIVE_KEY_CONTEXT) + hash_all_at_once::(context.as_bytes(), &IV_BYTES, DERIVE_KEY_CONTEXT) .root_hash(); - let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); - hash_all_at_once::(key_material, &context_key_words, DERIVE_KEY_MATERIAL) + hash_all_at_once::(key_material, context_key.as_bytes(), DERIVE_KEY_MATERIAL) .root_hash() .0 } @@ -909,9 +649,8 @@ pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] { fn parent_node_output( left_child: &CVBytes, right_child: &CVBytes, - key: &CVWords, - flags: u8, - platform: Platform, + key: &CVBytes, + flags: u32, ) -> Output { let mut block = [0; BLOCK_LEN]; block[..32].copy_from_slice(left_child); @@ -921,8 +660,7 @@ fn parent_node_output( block, block_len: BLOCK_LEN as u8, counter: 0, - flags: flags | PARENT, - platform, + flags: (flags | PARENT) as u8, } } @@ -963,7 +701,7 @@ fn parent_node_output( #[derive(Clone)] #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))] pub struct Hasher { - key: CVWords, + key: CVBytes, chunk_state: ChunkState, // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk @@ -974,26 +712,25 @@ pub struct Hasher { } impl Hasher { - fn new_internal(key: &CVWords, flags: u8) -> Self { + fn new_internal(key: &CVBytes, flags: u32) -> Self { Self { key: *key, - chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), + chunk_state: ChunkState::new(key, 0, flags), cv_stack: ArrayVec::new(), } } /// Construct a new `Hasher` for the regular hash function. pub fn new() -> Self { - Self::new_internal(IV, 0) + Self::new_internal(&IV_BYTES, 0) } /// Construct a new `Hasher` for the keyed hash function. See /// [`keyed_hash`]. /// /// [`keyed_hash`]: fn.keyed_hash.html - pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { - let key_words = platform::words_from_le_bytes_32(key); - Self::new_internal(&key_words, KEYED_HASH) + pub fn new_keyed(key: &CVBytes) -> Self { + Self::new_internal(key, KEYED_HASH) } /// Construct a new `Hasher` for the key derivation function. See @@ -1003,10 +740,9 @@ impl Hasher { /// [`derive_key`]: fn.derive_key.html pub fn new_derive_key(context: &str) -> Self { let context_key = - hash_all_at_once::(context.as_bytes(), IV, DERIVE_KEY_CONTEXT) + hash_all_at_once::(context.as_bytes(), &IV_BYTES, DERIVE_KEY_CONTEXT) .root_hash(); - let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); - Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL) + Self::new_internal(context_key.as_bytes(), DERIVE_KEY_MATERIAL) } /// Reset the `Hasher` to its initial state. @@ -1014,12 +750,7 @@ impl Hasher { /// This is functionally the same as overwriting the `Hasher` with a new /// one, using the same key or context string if any. pub fn reset(&mut self) -> &mut Self { - self.chunk_state = ChunkState::new( - &self.key, - 0, - self.chunk_state.flags, - self.chunk_state.platform, - ); + self.chunk_state = ChunkState::new(&self.key, 0, self.chunk_state.flags as u32); self.cv_stack.clear(); self } @@ -1044,8 +775,7 @@ impl Hasher { &left_child, &right_child, &self.key, - self.chunk_state.flags, - self.chunk_state.platform, + self.chunk_state.flags as u32, ); self.cv_stack.push(parent_output.chaining_value()); } @@ -1118,8 +848,7 @@ impl Hasher { self.chunk_state = ChunkState::new( &self.key, self.chunk_state.chunk_counter + 1, - self.chunk_state.flags, - self.chunk_state.platform, + self.chunk_state.flags as u32, ); } else { return self; @@ -1142,7 +871,7 @@ impl Hasher { while input.len() > CHUNK_LEN { debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data"); debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); - let mut subtree_len = largest_power_of_two_leq(input.len()); + let mut subtree_len = guts::largest_power_of_two_leq(input.len()); let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; // Shrink the subtree_len until it evenly divides the count so far. // We know that subtree_len itself is a power of 2, so we can use a @@ -1174,8 +903,7 @@ impl Hasher { &ChunkState::new( &self.key, self.chunk_state.chunk_counter, - self.chunk_state.flags, - self.chunk_state.platform, + self.chunk_state.flags as u32, ) .update(&input[..subtree_len]) .output() @@ -1189,11 +917,10 @@ impl Hasher { &input[..subtree_len], &self.key, self.chunk_state.chunk_counter, - self.chunk_state.flags, - self.chunk_state.platform, + self.chunk_state.flags as u32, ); - let left_cv = array_ref!(cv_pair, 0, 32); - let right_cv = array_ref!(cv_pair, 32, 32); + let left_cv = cv_pair[..32].try_into().unwrap(); + let right_cv = cv_pair[32..].try_into().unwrap(); // Push the two CVs we received into the CV stack in order. Because // the stack merges lazily, this guarantees we aren't merging the // root. @@ -1256,8 +983,7 @@ impl Hasher { &self.cv_stack[num_cvs_remaining - 2], &self.cv_stack[num_cvs_remaining - 1], &self.key, - self.chunk_state.flags, - self.chunk_state.platform, + self.chunk_state.flags as u32, ); num_cvs_remaining -= 2; } @@ -1266,8 +992,7 @@ impl Hasher { &self.cv_stack[num_cvs_remaining - 1], &output.chaining_value(), &self.key, - self.chunk_state.flags, - self.chunk_state.platform, + self.chunk_state.flags as u32, ); num_cvs_remaining -= 1; } @@ -1481,7 +1206,6 @@ impl fmt::Debug for Hasher { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Hasher") .field("flags", &self.chunk_state.flags) - .field("platform", &self.chunk_state.platform) .finish() } } @@ -1546,6 +1270,62 @@ impl OutputReader { } } + // There's some nontrivial logic here to handle partial blocks, and I don't want to copy-paste + // it between the xof and xof_xor cases. + #[inline(always)] + fn fill_inner(&mut self, mut buf: &mut [u8], xor: bool) { + debug_assert!(self.position_within_block < BLOCK_LEN as u8); + let xof_fn = if xor { + guts::Implementation::xof_xor + } else { + guts::Implementation::xof + }; + if self.position_within_block != 0 { + // The xof() and xof_xor() APIs can handle a partial block at the end but not a partial + // block at the beginning. We handle the beginning case here. Start by computing the + // complete block that we need part of. + let mut partial_block = [0u8; 64]; + xof_fn( + &guts::DETECTED_IMPL, + &self.inner.block, + self.inner.block_len as u32, + &self.inner.input_chaining_value, + self.inner.counter, + self.inner.flags as u32, + &mut partial_block, + ); + let output_bytes = &partial_block[self.position_within_block as usize..]; + let take = cmp::min(buf.len(), output_bytes.len()); + if xor { + for byte_index in 0..take { + buf[byte_index] ^= output_bytes[byte_index]; + } + } else { + buf[..take].copy_from_slice(&output_bytes[..take]); + } + buf = &mut buf[take..]; + self.position_within_block += take as u8; + if self.position_within_block == BLOCK_LEN as u8 { + self.position_within_block = 0; + self.inner.counter += 1; + } else { + debug_assert!(buf.is_empty()); + return; + } + } + xof_fn( + &guts::DETECTED_IMPL, + &self.inner.block, + self.inner.block_len as u32, + &self.inner.input_chaining_value, + self.inner.counter, + self.inner.flags as u32, + buf, + ); + self.inner.counter += (buf.len() / BLOCK_LEN) as u64; + self.position_within_block = (buf.len() % BLOCK_LEN) as u8; + } + /// Fill a buffer with output bytes and advance the position of the /// `OutputReader`. This is equivalent to [`Read::read`], except that it /// doesn't return a `Result`. Both methods always fill the entire buffer. @@ -1561,19 +1341,12 @@ impl OutputReader { /// reading further, the behavior is unspecified. /// /// [`Read::read`]: #method.read - pub fn fill(&mut self, mut buf: &mut [u8]) { - while !buf.is_empty() { - let block: [u8; BLOCK_LEN] = self.inner.root_output_block(); - let output_bytes = &block[self.position_within_block as usize..]; - let take = cmp::min(buf.len(), output_bytes.len()); - buf[..take].copy_from_slice(&output_bytes[..take]); - buf = &mut buf[take..]; - self.position_within_block += take as u8; - if self.position_within_block == BLOCK_LEN as u8 { - self.inner.counter += 1; - self.position_within_block = 0; - } - } + pub fn fill(&mut self, buf: &mut [u8]) { + self.fill_inner(buf, false); + } + + pub fn fill_xor(&mut self, buf: &mut [u8]) { + self.fill_inner(buf, true); } /// Return the current read position in the output stream. This is diff --git a/src/portable.rs b/src/portable.rs index 7af6828b0..b82bd7f21 100644 --- a/src/portable.rs +++ b/src/portable.rs @@ -181,10 +181,10 @@ pub fn hash_many( pub mod test { use super::*; - // This is basically testing the portable implementation against itself, - // but it also checks that compress_in_place and compress_xof are - // consistent. And there are tests against the reference implementation and - // against hardcoded test vectors elsewhere. + // These are basically testing the portable implementation against itself, but we also check + // that compress_in_place and compress_xof are consistent. And there are tests against the + // reference implementation and against hardcoded test vectors elsewhere. + #[test] fn test_compress() { crate::test::test_compress_fn(compress_in_place, compress_xof); diff --git a/src/test.rs b/src/test.rs index fb1e8495e..8d5c6ec7c 100644 --- a/src/test.rs +++ b/src/test.rs @@ -1,6 +1,7 @@ -use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; -use arrayref::array_ref; -use arrayvec::ArrayVec; +use blake3_guts as guts; +use guts::{CVBytes, CVWords, BLOCK_LEN, CHUNK_LEN}; + +use core::cmp; use core::usize; use rand::prelude::*; @@ -46,172 +47,12 @@ pub const TEST_CASES: &[usize] = &[ pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; // There's a test to make sure these two are equal below. -pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; -pub const TEST_KEY_WORDS: CVWords = [ - 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, -]; - -// Paint the input with a repeating byte pattern. We use a cycle length of 251, -// because that's the largest prime number less than 256. This makes it -// unlikely to swapping any two adjacent input blocks or chunks will give the -// same answer. -pub fn paint_test_input(buf: &mut [u8]) { - for (i, b) in buf.iter_mut().enumerate() { - *b = (i % 251) as u8; - } -} - -type CompressInPlaceFn = - unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); - -type CompressXofFn = unsafe fn( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64]; - -// A shared helper function for platform-specific tests. -pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { - let initial_state = TEST_KEY_WORDS; - let block_len: u8 = 61; - let mut block = [0; BLOCK_LEN]; - paint_test_input(&mut block[..block_len as usize]); - // Use a counter with set bits in both 32-bit words. - let counter = (5u64 << 32) + 6; - let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH; - - let portable_out = - crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); - - let mut test_state = initial_state; - unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; - let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state); - let test_xof = - unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; - - assert_eq!(&portable_out[..32], &test_state_bytes[..]); - assert_eq!(&portable_out[..], &test_xof[..]); -} - -type HashManyFn = unsafe fn( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -); - -// A shared helper function for platform-specific tests. -pub fn test_hash_many_fn( - hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>, - hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>, -) { - // Test a few different initial counter values. - // - 0: The base case. - // - u32::MAX: The low word of the counter overflows for all inputs except the first. - // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR - // when you're supposed to ANDNOT... - let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; - for counter in initial_counters { - #[cfg(feature = "std")] - dbg!(counter); - - // 31 (16 + 8 + 4 + 2 + 1) inputs - const NUM_INPUTS: usize = 31; - let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; - crate::test::paint_test_input(&mut input_buf); - - // First hash chunks. - let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new(); - for i in 0..NUM_INPUTS { - chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); - } - let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; - crate::portable::hash_many( - &chunks, - &TEST_KEY_WORDS, - counter, - IncrementCounter::Yes, - crate::KEYED_HASH, - crate::CHUNK_START, - crate::CHUNK_END, - &mut portable_chunks_out, - ); - - let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - hash_many_chunks_fn( - &chunks[..], - &TEST_KEY_WORDS, - counter, - IncrementCounter::Yes, - crate::KEYED_HASH, - crate::CHUNK_START, - crate::CHUNK_END, - &mut test_chunks_out, - ); - } - for n in 0..NUM_INPUTS { - #[cfg(feature = "std")] - dbg!(n); - assert_eq!( - &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], - &test_chunks_out[n * OUT_LEN..][..OUT_LEN] - ); - } - - // Then hash parents. - let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new(); - for i in 0..NUM_INPUTS { - parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); - } - let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; - crate::portable::hash_many( - &parents, - &TEST_KEY_WORDS, - counter, - IncrementCounter::No, - crate::KEYED_HASH | crate::PARENT, - 0, - 0, - &mut portable_parents_out, - ); - - let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - hash_many_parents_fn( - &parents[..], - &TEST_KEY_WORDS, - counter, - IncrementCounter::No, - crate::KEYED_HASH | crate::PARENT, - 0, - 0, - &mut test_parents_out, - ); - } - for n in 0..NUM_INPUTS { - #[cfg(feature = "std")] - dbg!(n); - assert_eq!( - &portable_parents_out[n * OUT_LEN..][..OUT_LEN], - &test_parents_out[n * OUT_LEN..][..OUT_LEN] - ); - } - } -} +pub const TEST_KEY: &CVBytes = b"whats the Elvish word for friend"; +pub const TEST_KEY_WORDS: &CVWords = &guts::words_from_le_bytes_32(TEST_KEY); #[test] fn test_key_bytes_equal_key_words() { - assert_eq!( - TEST_KEY_WORDS, - crate::platform::words_from_le_bytes_32(&TEST_KEY), - ); + assert_eq!(TEST_KEY, &guts::le_bytes_from_words_32(TEST_KEY_WORDS),); } #[test] @@ -224,52 +65,9 @@ fn test_reference_impl_size() { assert_eq!(1880, core::mem::size_of::()); } -#[test] -fn test_counter_words() { - let counter: u64 = (1 << 32) + 2; - assert_eq!(crate::counter_low(counter), 2); - assert_eq!(crate::counter_high(counter), 1); -} - -#[test] -fn test_largest_power_of_two_leq() { - let input_output = &[ - // The zero case is nonsensical, but it does work. - (0, 1), - (1, 1), - (2, 2), - (3, 2), - (4, 4), - (5, 4), - (6, 4), - (7, 4), - (8, 8), - // the largest possible usize - (usize::MAX, (usize::MAX >> 1) + 1), - ]; - for &(input, output) in input_output { - assert_eq!( - output, - crate::largest_power_of_two_leq(input), - "wrong output for n={}", - input - ); - } -} - -#[test] -fn test_left_len() { - let input_output = &[ - (CHUNK_LEN + 1, CHUNK_LEN), - (2 * CHUNK_LEN - 1, CHUNK_LEN), - (2 * CHUNK_LEN, CHUNK_LEN), - (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN, 2 * CHUNK_LEN), - (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), - ]; - for &(input, output) in input_output { - assert_eq!(crate::left_len(input), output); +pub(crate) fn paint_test_input(buf: &mut [u8]) { + for (i, b) in buf.iter_mut().enumerate() { + *b = (i % 251) as u8; } } @@ -292,18 +90,18 @@ fn test_compare_reference_impl() { // all at once let test_out = crate::hash(input); - assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); + assert_eq!(test_out, expected_out[..32]); // incremental let mut hasher = crate::Hasher::new(); hasher.update(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), expected_out[..32]); assert_eq!(hasher.finalize(), test_out); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new(); hasher.update_rayon(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), expected_out[..32]); assert_eq!(hasher.finalize(), test_out); } // xof @@ -314,25 +112,25 @@ fn test_compare_reference_impl() { // keyed { - let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); + let mut reference_hasher = reference_impl::Hasher::new_keyed(TEST_KEY); reference_hasher.update(input); let mut expected_out = [0; OUT]; reference_hasher.finalize(&mut expected_out); // all at once - let test_out = crate::keyed_hash(&TEST_KEY, input); - assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); + let test_out = crate::keyed_hash(TEST_KEY, input); + assert_eq!(test_out, expected_out[..32]); // incremental - let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); + let mut hasher = crate::Hasher::new_keyed(TEST_KEY); hasher.update(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), expected_out[..32]); assert_eq!(hasher.finalize(), test_out); // incremental (rayon) #[cfg(feature = "rayon")] { - let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); + let mut hasher = crate::Hasher::new_keyed(TEST_KEY); hasher.update_rayon(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); + assert_eq!(hasher.finalize(), expected_out[..32]); assert_eq!(hasher.finalize(), test_out); } // xof @@ -355,15 +153,15 @@ fn test_compare_reference_impl() { // incremental let mut hasher = crate::Hasher::new_derive_key(context); hasher.update(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); - assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); + assert_eq!(hasher.finalize(), expected_out[..32]); + assert_eq!(hasher.finalize(), test_out[..32]); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new_derive_key(context); hasher.update_rayon(input); - assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); - assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); + assert_eq!(hasher.finalize(), expected_out[..32]); + assert_eq!(hasher.finalize(), test_out[..32]); } // xof let mut extended = [0; OUT]; @@ -423,9 +221,9 @@ fn test_fuzz_hasher() { let mut input_buf = [0; 3 * INPUT_MAX]; paint_test_input(&mut input_buf); - // Don't do too many iterations in debug mode, to keep the tests under a - // second or so. CI should run tests in release mode also. Provide an - // environment variable for specifying a larger number of fuzz iterations. + // Don't do too many iterations in debug mode, to keep the tests under a second or so. CI + // should run tests in release mode also. + // TODO: Provide an environment variable for specifying a larger number of fuzz iterations? let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; // Use a fixed RNG seed for reproducibility. @@ -493,6 +291,133 @@ fn test_xof_seek() { } } +#[test] +fn test_xof_xor() { + for step in [32, 63, 64, 128, 303] { + #[cfg(feature = "std")] + dbg!(step); + let mut ref_hasher = reference_impl::Hasher::new(); + ref_hasher.update(b"foo"); + let mut ref_output = [0u8; 1000]; + ref_hasher.finalize(&mut ref_output); + + let mut hasher = crate::Hasher::new(); + hasher.update(b"foo"); + let mut reader = hasher.finalize_xof(); + + let mut test_output = [0u8; 1000]; + for chunk in test_output.chunks_mut(step) { + reader.fill(chunk); + } + assert_eq!(ref_output, test_output); + // Xor'ing the same output should zero the buffer. + reader.set_position(0); + for chunk in test_output.chunks_mut(step) { + reader.fill_xor(chunk); + } + assert_eq!([0u8; 1000], test_output); + // Xor'ing the same output again should reproduce the original. + reader.set_position(0); + for chunk in test_output.chunks_mut(step) { + reader.fill_xor(chunk); + } + assert_eq!(ref_output, test_output); + + // Repeat the same test but starting at offset 500. + reader.set_position(500); + for chunk in test_output[..500].chunks_mut(step) { + reader.fill(chunk); + } + assert_eq!(ref_output[500..], test_output[..500]); + reader.set_position(500); + for chunk in test_output[..500].chunks_mut(step) { + reader.fill_xor(chunk); + } + assert_eq!([0u8; 500], test_output[..500]); + reader.set_position(500); + for chunk in test_output[..500].chunks_mut(step) { + reader.fill_xor(chunk); + } + assert_eq!(ref_output[500..], test_output[..500]); + } +} + +#[test] +#[cfg(feature = "std")] +fn test_fuzz_xof() { + // Use a fixed RNG seed for reproducibility. + let mut rng = rand_chacha::ChaCha8Rng::from_seed([99; 32]); + let random_key: [u8; 32] = rng.gen(); + + let possible_seeks = [-64i64, -63 - 1, 0, 1, 63, 64, 127, 128, 129]; + + const MAX_LEN: usize = 1100; + let possible_lengths = [0usize, 1, 63, 64, 65, 128, 256, 512, 1024, MAX_LEN]; + assert!(possible_lengths.into_iter().all(|x| x <= MAX_LEN)); + + let mut xof_output = crate::Hasher::new_keyed(&random_key).finalize_xof(); + let mut xof_xor_output = crate::Hasher::new_keyed(&random_key).finalize_xof(); + + // Don't do too many iterations in debug mode, to keep the tests under a second or so. CI + // should run tests in release mode also. + // TODO: Provide an environment variable for specifying a larger number of fuzz iterations? + let num_tests = if cfg!(debug_assertions) { + 1_000 + } else { + 100_000 + }; + + let mut position = 0; + let mut ref_output = Vec::new(); + for test_i in 0..num_tests { + eprintln!("--- test {test_i} ---"); + // Do a random relative seek maybe. Could be zero. + let relative_seek: i64 = *possible_seeks.choose(&mut rng).unwrap(); + dbg!(relative_seek); + if relative_seek != 0 { + let new_position = position as i64 + relative_seek; + if 0 <= new_position && new_position <= MAX_LEN as i64 { + position = new_position as u64; + } else { + position = 0; + } + assert_eq!(xof_output.position(), xof_xor_output.position()); + xof_output.set_position(position as u64); + xof_xor_output.set_position(position as u64); + } + dbg!(position); + + // Generate a random number of output bytes. If the amount of output we've gotten from the + // reference_impl isn't enough, double it. + let len: usize = *possible_lengths.choose(&mut rng).unwrap(); + dbg!(len); + if position as usize + len > ref_output.len() { + let new_len = cmp::max(MAX_LEN, 2 * ref_output.len()); + ref_output = vec![0u8; new_len]; + eprintln!("grow reference output length to {}", ref_output.len()); + let ref_hasher = reference_impl::Hasher::new_keyed(&random_key); + ref_hasher.finalize(&mut ref_output); + } + let mut buf = [0u8; MAX_LEN]; + xof_output.fill(&mut buf[..len]); + assert_eq!(ref_output[position as usize..][..len], buf[..len]); + assert_eq!([0u8; MAX_LEN][..MAX_LEN - len], buf[len..]); + + // Xor over the output with a random byte value, and then confirm that xof_xor() recovers + // that value. + let random_byte: u8 = rng.gen(); + dbg!(random_byte); + for i in 0..len { + buf[i] ^= random_byte; + } + xof_xor_output.fill_xor(&mut buf[..len]); + assert_eq!([random_byte; MAX_LEN][..len], buf[..len]); + assert_eq!([0u8; MAX_LEN][..MAX_LEN - len], buf[len..]); + + position += len as u64; + } +} + #[test] fn test_msg_schedule_permutation() { let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; @@ -506,7 +431,7 @@ fn test_msg_schedule_permutation() { } } - assert_eq!(generated, crate::MSG_SCHEDULE); + assert_eq!(generated, guts::MSG_SCHEDULE); } #[test] @@ -640,53 +565,43 @@ fn test_zeroize() { let mut hasher = crate::Hasher { chunk_state: crate::ChunkState { - cv: [42; 8], + cv: [42; 32], chunk_counter: 42, buf: [42; 64], buf_len: 42, blocks_compressed: 42, flags: 42, - platform: crate::Platform::Portable, }, - key: [42; 8], + key: [42; 32], cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(), }; hasher.zeroize(); - assert_eq!(hasher.chunk_state.cv, [0; 8]); + assert_eq!(hasher.chunk_state.cv, [0; 32]); assert_eq!(hasher.chunk_state.chunk_counter, 0); assert_eq!(hasher.chunk_state.buf, [0; 64]); assert_eq!(hasher.chunk_state.buf_len, 0); assert_eq!(hasher.chunk_state.blocks_compressed, 0); assert_eq!(hasher.chunk_state.flags, 0); - assert!(matches!( - hasher.chunk_state.platform, - crate::Platform::Portable - )); - assert_eq!(hasher.key, [0; 8]); + assert_eq!(hasher.key, [0; 32]); assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]); let mut output_reader = crate::OutputReader { inner: crate::Output { - input_chaining_value: [42; 8], + input_chaining_value: [42; 32], block: [42; 64], counter: 42, block_len: 42, flags: 42, - platform: crate::Platform::Portable, }, position_within_block: 42, }; output_reader.zeroize(); - assert_eq!(output_reader.inner.input_chaining_value, [0; 8]); + assert_eq!(output_reader.inner.input_chaining_value, [0; 32]); assert_eq!(output_reader.inner.block, [0; 64]); assert_eq!(output_reader.inner.counter, 0); assert_eq!(output_reader.inner.block_len, 0); assert_eq!(output_reader.inner.flags, 0); - assert!(matches!( - output_reader.inner.platform, - crate::Platform::Portable - )); assert_eq!(output_reader.position_within_block, 0); } diff --git a/test_vectors/Cargo.toml b/test_vectors/Cargo.toml index 87a9eba19..21098d41d 100644 --- a/test_vectors/Cargo.toml +++ b/test_vectors/Cargo.toml @@ -12,6 +12,7 @@ pure = ["blake3/pure"] # If you ever change these path dependencies, you'll probably need to update # cross_test.sh, or CI will break. I'm sorry >.< blake3 = { path = "../" } +blake3_guts = { path = "../rust/guts" } hex = "0.4.0" reference_impl = { path = "../reference_impl" } serde = { version = "1.0", features = ["derive"] } diff --git a/test_vectors/src/lib.rs b/test_vectors/src/lib.rs index 6a4c79846..ea2491a92 100644 --- a/test_vectors/src/lib.rs +++ b/test_vectors/src/lib.rs @@ -1,4 +1,4 @@ -use blake3::guts::{BLOCK_LEN, CHUNK_LEN}; +use blake3_guts::{BLOCK_LEN, CHUNK_LEN}; use serde::{Deserialize, Serialize}; // A non-multiple of 4 is important, since one possible bug is to fail to emit