From fe238e49a0b00bb3c4423a30f381154549278fd7 Mon Sep 17 00:00:00 2001
From: Jack O'Connor <oconnor663@gmail.com>
Date: Sun, 21 Jan 2024 21:15:45 -0800
Subject: [PATCH] factor out the `blake3` crate changes from the guts_api
 branch

This commit and the branch that it starts are unlikely to land as-is,
but I want to maintain them while I flesh out the new `blake3_guts`
sub-crate.
---
 Cargo.toml              |   2 +-
 b3sum/Cargo.lock        |  12 +-
 b3sum/src/main.rs       |   2 +-
 benches/bench.rs        | 175 +------------
 src/lib.rs              | 551 ++++++++++++----------------------------
 src/portable.rs         |   8 +-
 src/test.rs             | 407 ++++++++++++-----------------
 test_vectors/Cargo.toml |   1 +
 test_vectors/src/lib.rs |   2 +-
 9 files changed, 337 insertions(+), 823 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 591a6e32a..fe7343649 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -92,8 +92,8 @@ no_neon = []
 features = ["mmap", "rayon", "serde", "zeroize"]
 
 [dependencies]
-arrayref = "0.3.5"
 arrayvec = { version = "0.7.4", default-features = false }
+blake3_guts = { path = "rust/guts" }
 constant_time_eq = "0.3.0"
 cfg-if = "1.0.0"
 digest = { version = "0.10.1", features = [ "mac" ], optional = true }
diff --git a/b3sum/Cargo.lock b/b3sum/Cargo.lock
index 10caffb4a..7bab0b05c 100644
--- a/b3sum/Cargo.lock
+++ b/b3sum/Cargo.lock
@@ -56,12 +56,6 @@ version = "1.0.79"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca"
 
-[[package]]
-name = "arrayref"
-version = "0.3.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545"
-
 [[package]]
 name = "arrayvec"
 version = "0.7.4"
@@ -98,8 +92,8 @@ checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf"
 name = "blake3"
 version = "1.5.0"
 dependencies = [
- "arrayref",
  "arrayvec",
+ "blake3_guts",
  "cc",
  "cfg-if",
  "constant_time_eq",
@@ -107,6 +101,10 @@ dependencies = [
  "rayon",
 ]
 
+[[package]]
+name = "blake3_guts"
+version = "0.0.0"
+
 [[package]]
 name = "cc"
 version = "1.0.83"
diff --git a/b3sum/src/main.rs b/b3sum/src/main.rs
index 228737ff0..3dadc002e 100644
--- a/b3sum/src/main.rs
+++ b/b3sum/src/main.rs
@@ -186,7 +186,7 @@ fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> Result<()>
     // TODO: This computes each output block twice when the --seek argument isn't a multiple of 64.
     // We'll refactor all of this soon anyway, once SIMD optimizations are available for the XOF.
     let mut len = args.len();
-    let mut block = [0; blake3::guts::BLOCK_LEN];
+    let mut block = [0; 64];
     while len > 0 {
         output.fill(&mut block);
         let hex_str = hex::encode(&block[..]);
diff --git a/benches/bench.rs b/benches/bench.rs
index 5efb9e6f5..e057d24e1 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -2,11 +2,7 @@
 
 extern crate test;
 
-use arrayref::array_ref;
-use arrayvec::ArrayVec;
-use blake3::guts::{BLOCK_LEN, CHUNK_LEN};
-use blake3::platform::{Platform, MAX_SIMD_DEGREE};
-use blake3::OUT_LEN;
+use blake3_guts::BLOCK_LEN;
 use rand::prelude::*;
 use test::Bencher;
 
@@ -49,175 +45,6 @@ impl RandomInput {
     }
 }
 
-fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) {
-    let mut state = [1u32; 8];
-    let mut r = RandomInput::new(b, 64);
-    let input = array_ref!(r.get(), 0, 64);
-    b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0));
-}
-
-#[bench]
-fn bench_single_compression_portable(b: &mut Bencher) {
-    bench_single_compression_fn(b, Platform::portable());
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_single_compression_sse2(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse2() {
-        bench_single_compression_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_single_compression_sse41(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse41() {
-        bench_single_compression_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_single_compression_avx512(b: &mut Bencher) {
-    if let Some(platform) = Platform::avx512() {
-        bench_single_compression_fn(b, platform);
-    }
-}
-
-fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
-    let degree = platform.simd_degree();
-    let mut inputs = Vec::new();
-    for _ in 0..degree {
-        inputs.push(RandomInput::new(b, CHUNK_LEN));
-    }
-    b.iter(|| {
-        let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs
-            .iter_mut()
-            .take(degree)
-            .map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
-            .collect();
-        let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
-        platform.hash_many(
-            &input_arrays[..],
-            &[0; 8],
-            0,
-            blake3::IncrementCounter::Yes,
-            0,
-            0,
-            0,
-            &mut out,
-        );
-    });
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_sse2(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse2() {
-        bench_many_chunks_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_sse41(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse41() {
-        bench_many_chunks_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_chunks_avx2(b: &mut Bencher) {
-    if let Some(platform) = Platform::avx2() {
-        bench_many_chunks_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_many_chunks_avx512(b: &mut Bencher) {
-    if let Some(platform) = Platform::avx512() {
-        bench_many_chunks_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(feature = "neon")]
-fn bench_many_chunks_neon(b: &mut Bencher) {
-    if let Some(platform) = Platform::neon() {
-        bench_many_chunks_fn(b, platform);
-    }
-}
-
-// TODO: When we get const generics we can unify this with the chunks code.
-fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
-    let degree = platform.simd_degree();
-    let mut inputs = Vec::new();
-    for _ in 0..degree {
-        inputs.push(RandomInput::new(b, BLOCK_LEN));
-    }
-    b.iter(|| {
-        let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs
-            .iter_mut()
-            .take(degree)
-            .map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
-            .collect();
-        let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
-        platform.hash_many(
-            &input_arrays[..],
-            &[0; 8],
-            0,
-            blake3::IncrementCounter::No,
-            0,
-            0,
-            0,
-            &mut out,
-        );
-    });
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_sse2(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse2() {
-        bench_many_parents_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_sse41(b: &mut Bencher) {
-    if let Some(platform) = Platform::sse41() {
-        bench_many_parents_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-fn bench_many_parents_avx2(b: &mut Bencher) {
-    if let Some(platform) = Platform::avx2() {
-        bench_many_parents_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(blake3_avx512_ffi)]
-fn bench_many_parents_avx512(b: &mut Bencher) {
-    if let Some(platform) = Platform::avx512() {
-        bench_many_parents_fn(b, platform);
-    }
-}
-
-#[bench]
-#[cfg(feature = "neon")]
-fn bench_many_parents_neon(b: &mut Bencher) {
-    if let Some(platform) = Platform::neon() {
-        bench_many_parents_fn(b, platform);
-    }
-}
-
 fn bench_atonce(b: &mut Bencher, len: usize) {
     let mut input = RandomInput::new(b, len);
     b.iter(|| blake3::hash(input.get()));
diff --git a/src/lib.rs b/src/lib.rs
index 1fe47bf54..efc1fc4ee 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -88,109 +88,29 @@
 #[cfg(test)]
 mod test;
 
-// The guts module is for incremental use cases like the `bao` crate that need
-// to explicitly compute chunk and parent chaining values. It is semi-stable
-// and likely to keep working, but largely undocumented and not intended for
-// widespread use.
-#[doc(hidden)]
-pub mod guts;
-
-/// Undocumented and unstable, for benchmarks only.
-#[doc(hidden)]
-pub mod platform;
-
-// Platform-specific implementations of the compression function. These
-// BLAKE3-specific cfg flags are set in build.rs.
-#[cfg(blake3_avx2_rust)]
-#[path = "rust_avx2.rs"]
-mod avx2;
-#[cfg(blake3_avx2_ffi)]
-#[path = "ffi_avx2.rs"]
-mod avx2;
-#[cfg(blake3_avx512_ffi)]
-#[path = "ffi_avx512.rs"]
-mod avx512;
-#[cfg(blake3_neon)]
-#[path = "ffi_neon.rs"]
-mod neon;
-mod portable;
-#[cfg(blake3_sse2_rust)]
-#[path = "rust_sse2.rs"]
-mod sse2;
-#[cfg(blake3_sse2_ffi)]
-#[path = "ffi_sse2.rs"]
-mod sse2;
-#[cfg(blake3_sse41_rust)]
-#[path = "rust_sse41.rs"]
-mod sse41;
-#[cfg(blake3_sse41_ffi)]
-#[path = "ffi_sse41.rs"]
-mod sse41;
-
 #[cfg(feature = "traits-preview")]
 pub mod traits;
 
 mod io;
 mod join;
 
-use arrayref::{array_mut_ref, array_ref};
 use arrayvec::{ArrayString, ArrayVec};
 use core::cmp;
 use core::fmt;
-use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2};
 
-/// The number of bytes in a [`Hash`](struct.Hash.html), 32.
+use blake3_guts as guts;
+use guts::{
+    BlockBytes, CVBytes, BLOCK_LEN, CHUNK_END, CHUNK_LEN, CHUNK_START, DERIVE_KEY_CONTEXT,
+    DERIVE_KEY_MATERIAL, IV_BYTES, KEYED_HASH, PARENT, ROOT,
+};
+
+/// The number of bytes in a [`Hash`](struct.Hash.html), 32.
 pub const OUT_LEN: usize = 32;
 
-/// The number of bytes in a key, 32.
+/// The number of bytes in a key, 32.
 pub const KEY_LEN: usize = 32;
 
 const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64
-use guts::{BLOCK_LEN, CHUNK_LEN};
-
-// While iterating the compression function within a chunk, the CV is
-// represented as words, to avoid doing two extra endianness conversions for
-// each compression in the portable implementation. But the hash_many interface
-// needs to hash both input bytes and parent nodes, so its better for its
-// output CVs to be represented as bytes.
-type CVWords = [u32; 8];
-type CVBytes = [u8; 32]; // little-endian
-
-const IV: &CVWords = &[
-    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
-];
-
-const MSG_SCHEDULE: [[usize; 16]; 7] = [
-    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
-    [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8],
-    [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1],
-    [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6],
-    [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4],
-    [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7],
-    [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13],
-];
-
-// These are the internal flags that we use to domain separate root/non-root,
-// chunk/parent, and chunk beginning/middle/end. These get set at the high end
-// of the block flags word in the compression function, so their values start
-// high and go down.
-const CHUNK_START: u8 = 1 << 0;
-const CHUNK_END: u8 = 1 << 1;
-const PARENT: u8 = 1 << 2;
-const ROOT: u8 = 1 << 3;
-const KEYED_HASH: u8 = 1 << 4;
-const DERIVE_KEY_CONTEXT: u8 = 1 << 5;
-const DERIVE_KEY_MATERIAL: u8 = 1 << 6;
-
-#[inline]
-fn counter_low(counter: u64) -> u32 {
-    counter as u32
-}
-
-#[inline]
-fn counter_high(counter: u64) -> u32 {
-    (counter >> 32) as u32
-}
 
 /// An output of the default size, 32 bytes, which provides constant-time
 /// equality checking.
@@ -219,19 +139,19 @@ fn counter_high(counter: u64) -> u32 {
 #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
 #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
 #[derive(Clone, Copy, Hash)]
-pub struct Hash([u8; OUT_LEN]);
+pub struct Hash(CVBytes);
 
 impl Hash {
     /// The raw bytes of the `Hash`. Note that byte arrays don't provide
     /// constant-time equality checking, so if  you need to compare hashes,
     /// prefer the `Hash` type.
     #[inline]
-    pub const fn as_bytes(&self) -> &[u8; OUT_LEN] {
+    pub const fn as_bytes(&self) -> &CVBytes {
         &self.0
     }
 
     /// Create a `Hash` from its raw bytes representation.
-    pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self {
+    pub const fn from_bytes(bytes: CVBytes) -> Self {
         Self(bytes)
     }
 
@@ -275,7 +195,7 @@ impl Hash {
         if hex_bytes.len() != OUT_LEN * 2 {
             return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len())));
         }
-        let mut hash_bytes: [u8; OUT_LEN] = [0; OUT_LEN];
+        let mut hash_bytes: CVBytes = [0; OUT_LEN];
         for i in 0..OUT_LEN {
             hash_bytes[i] = 16 * hex_val(hex_bytes[2 * i])? + hex_val(hex_bytes[2 * i + 1])?;
         }
@@ -283,14 +203,14 @@ impl Hash {
     }
 }
 
-impl From<[u8; OUT_LEN]> for Hash {
+impl From<CVBytes> for Hash {
     #[inline]
-    fn from(bytes: [u8; OUT_LEN]) -> Self {
+    fn from(bytes: CVBytes) -> Self {
         Self::from_bytes(bytes)
     }
 }
 
-impl From<Hash> for [u8; OUT_LEN] {
+impl From<Hash> for CVBytes {
     #[inline]
     fn from(hash: Hash) -> Self {
         hash.0
@@ -314,9 +234,9 @@ impl PartialEq for Hash {
 }
 
 /// This implementation is constant-time.
-impl PartialEq<[u8; OUT_LEN]> for Hash {
+impl PartialEq<CVBytes> for Hash {
     #[inline]
-    fn eq(&self, other: &[u8; OUT_LEN]) -> bool {
+    fn eq(&self, other: &CVBytes) -> bool {
         constant_time_eq::constant_time_eq_32(&self.0, other)
     }
 }
@@ -395,70 +315,56 @@ impl std::error::Error for HexError {}
 #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
 #[derive(Clone)]
 struct Output {
-    input_chaining_value: CVWords,
-    block: [u8; 64],
+    input_chaining_value: CVBytes,
+    block: BlockBytes,
     block_len: u8,
     counter: u64,
     flags: u8,
-    #[cfg_attr(feature = "zeroize", zeroize(skip))]
-    platform: Platform,
 }
 
 impl Output {
     fn chaining_value(&self) -> CVBytes {
-        let mut cv = self.input_chaining_value;
-        self.platform.compress_in_place(
-            &mut cv,
+        guts::DETECTED_IMPL.compress(
             &self.block,
-            self.block_len,
+            self.block_len as u32,
+            &self.input_chaining_value,
             self.counter,
-            self.flags,
-        );
-        platform::le_bytes_from_words_32(&cv)
+            self.flags as u32,
+        )
     }
 
     fn root_hash(&self) -> Hash {
         debug_assert_eq!(self.counter, 0);
-        let mut cv = self.input_chaining_value;
-        self.platform
-            .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT);
-        Hash(platform::le_bytes_from_words_32(&cv))
-    }
-
-    fn root_output_block(&self) -> [u8; 2 * OUT_LEN] {
-        self.platform.compress_xof(
-            &self.input_chaining_value,
+        Hash(guts::DETECTED_IMPL.compress(
             &self.block,
-            self.block_len,
-            self.counter,
-            self.flags | ROOT,
-        )
+            self.block_len as u32,
+            &self.input_chaining_value,
+            0,
+            self.flags as u32 | ROOT,
+        ))
     }
 }
 
 #[derive(Clone)]
 #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
 struct ChunkState {
-    cv: CVWords,
+    cv: CVBytes,
     chunk_counter: u64,
-    buf: [u8; BLOCK_LEN],
+    buf: BlockBytes,
     buf_len: u8,
     blocks_compressed: u8,
     flags: u8,
-    #[cfg_attr(feature = "zeroize", zeroize(skip))]
-    platform: Platform,
 }
 
 impl ChunkState {
-    fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self {
+    fn new(key: &CVBytes, chunk_counter: u64, flags: u32) -> Self {
         Self {
             cv: *key,
             chunk_counter,
             buf: [0; BLOCK_LEN],
             buf_len: 0,
             blocks_compressed: 0,
-            flags,
-            platform,
+            flags: flags as u8,
         }
     }
 
@@ -474,7 +380,7 @@ impl ChunkState {
         *input = &input[take..];
     }
 
-    fn start_flag(&self) -> u8 {
+    fn start_flag(&self) -> u32 {
         if self.blocks_compressed == 0 {
             CHUNK_START
         } else {
@@ -489,13 +395,12 @@ impl ChunkState {
             self.fill_buf(&mut input);
             if !input.is_empty() {
                 debug_assert_eq!(self.buf_len as usize, BLOCK_LEN);
-                let block_flags = self.flags | self.start_flag(); // borrowck
-                self.platform.compress_in_place(
-                    &mut self.cv,
+                self.cv = guts::DETECTED_IMPL.compress(
                     &self.buf,
-                    BLOCK_LEN as u8,
+                    BLOCK_LEN as u32,
+                    &self.cv,
                     self.chunk_counter,
-                    block_flags,
+                    self.flags as u32 | self.start_flag(),
                 );
                 self.buf_len = 0;
                 self.buf = [0; BLOCK_LEN];
@@ -505,13 +410,12 @@ impl ChunkState {
 
         while input.len() > BLOCK_LEN {
             debug_assert_eq!(self.buf_len, 0);
-            let block_flags = self.flags | self.start_flag(); // borrowck
-            self.platform.compress_in_place(
-                &mut self.cv,
-                array_ref!(input, 0, BLOCK_LEN),
-                BLOCK_LEN as u8,
+            self.cv = guts::DETECTED_IMPL.compress(
+                input[..BLOCK_LEN].try_into().unwrap(),
+                BLOCK_LEN as u32,
+                &self.cv,
                 self.chunk_counter,
-                block_flags,
+                self.flags as u32 | self.start_flag(),
             );
             self.blocks_compressed += 1;
             input = &input[BLOCK_LEN..];
@@ -524,14 +428,12 @@ impl ChunkState {
     }
 
     fn output(&self) -> Output {
-        let block_flags = self.flags | self.start_flag() | CHUNK_END;
         Output {
             input_chaining_value: self.cv,
             block: self.buf,
             block_len: self.buf_len,
             counter: self.chunk_counter,
-            flags: block_flags,
-            platform: self.platform,
+            flags: self.flags | self.start_flag() as u8 | CHUNK_END as u8,
         }
     }
 }
@@ -543,7 +445,6 @@ impl fmt::Debug for ChunkState {
             .field("len", &self.len())
             .field("chunk_counter", &self.chunk_counter)
             .field("flags", &self.flags)
-            .field("platform", &self.platform)
             .finish()
     }
 }
@@ -563,131 +464,6 @@ impl fmt::Debug for ChunkState {
 //   use full-width SIMD vectors for parent hashing. Without parallel parent
 //   hashing, we lose about 10% of overall throughput on AVX2 and AVX-512.
 
-/// Undocumented and unstable, for benchmarks only.
-#[doc(hidden)]
-#[derive(Clone, Copy)]
-pub enum IncrementCounter {
-    Yes,
-    No,
-}
-
-impl IncrementCounter {
-    #[inline]
-    fn yes(&self) -> bool {
-        match self {
-            IncrementCounter::Yes => true,
-            IncrementCounter::No => false,
-        }
-    }
-}
-
-// The largest power of two less than or equal to `n`, used for left_len()
-// immediately below, and also directly in Hasher::update().
-fn largest_power_of_two_leq(n: usize) -> usize {
-    ((n / 2) + 1).next_power_of_two()
-}
-
-// Given some input larger than one chunk, return the number of bytes that
-// should go in the left subtree. This is the largest power-of-2 number of
-// chunks that leaves at least 1 byte for the right subtree.
-fn left_len(content_len: usize) -> usize {
-    debug_assert!(content_len > CHUNK_LEN);
-    // Subtract 1 to reserve at least one byte for the right side.
-    let full_chunks = (content_len - 1) / CHUNK_LEN;
-    largest_power_of_two_leq(full_chunks) * CHUNK_LEN
-}
-
-// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
-// on a single thread. Write out the chunk chaining values and return the
-// number of chunks hashed. These chunks are never the root and never empty;
-// those cases use a different codepath.
-fn compress_chunks_parallel(
-    input: &[u8],
-    key: &CVWords,
-    chunk_counter: u64,
-    flags: u8,
-    platform: Platform,
-    out: &mut [u8],
-) -> usize {
-    debug_assert!(!input.is_empty(), "empty chunks below the root");
-    debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN);
-
-    let mut chunks_exact = input.chunks_exact(CHUNK_LEN);
-    let mut chunks_array = ArrayVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new();
-    for chunk in &mut chunks_exact {
-        chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN));
-    }
-    platform.hash_many(
-        &chunks_array,
-        key,
-        chunk_counter,
-        IncrementCounter::Yes,
-        flags,
-        CHUNK_START,
-        CHUNK_END,
-        out,
-    );
-
-    // Hash the remaining partial chunk, if there is one. Note that the empty
-    // chunk (meaning the empty message) is a different codepath.
-    let chunks_so_far = chunks_array.len();
-    if !chunks_exact.remainder().is_empty() {
-        let counter = chunk_counter + chunks_so_far as u64;
-        let mut chunk_state = ChunkState::new(key, counter, flags, platform);
-        chunk_state.update(chunks_exact.remainder());
-        *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) =
-            chunk_state.output().chaining_value();
-        chunks_so_far + 1
-    } else {
-        chunks_so_far
-    }
-}
-
-// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
-// on a single thread. Write out the parent chaining values and return the
-// number of parents hashed. (If there's an odd input chaining value left over,
-// return it as an additional output.) These parents are never the root and
-// never empty; those cases use a different codepath.
-fn compress_parents_parallel(
-    child_chaining_values: &[u8],
-    key: &CVWords,
-    flags: u8,
-    platform: Platform,
-    out: &mut [u8],
-) -> usize {
-    debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes");
-    let num_children = child_chaining_values.len() / OUT_LEN;
-    debug_assert!(num_children >= 2, "not enough children");
-    debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many");
-
-    let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN);
-    // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of
-    // the requirements of compress_subtree_wide().
-    let mut parents_array = ArrayVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new();
-    for parent in &mut parents_exact {
-        parents_array.push(array_ref!(parent, 0, BLOCK_LEN));
-    }
-    platform.hash_many(
-        &parents_array,
-        key,
-        0, // Parents always use counter 0.
-        IncrementCounter::No,
-        flags | PARENT,
-        0, // Parents have no start flags.
-        0, // Parents have no end flags.
-        out,
-    );
-
-    // If there's an odd child left over, it becomes an output.
-    let parents_so_far = parents_array.len();
-    if !parents_exact.remainder().is_empty() {
-        out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder());
-        parents_so_far + 1
-    } else {
-        parents_so_far
-    }
-}
-
 // The wide helper function returns (writes out) an array of chaining values
 // and returns the length of that array. The number of chaining values returned
 // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
@@ -707,66 +483,41 @@ fn compress_parents_parallel(
 // multithreading parallelism for that update().
 fn compress_subtree_wide<J: join::Join>(
     input: &[u8],
-    key: &CVWords,
+    key: &CVBytes,
     chunk_counter: u64,
-    flags: u8,
-    platform: Platform,
-    out: &mut [u8],
+    flags: u32,
+    out: guts::TransposedSplit,
 ) -> usize {
     // Note that the single chunk case does *not* bump the SIMD degree up to 2
     // when it is 1. This allows Rayon the option of multithreading even the
     // 2-chunk case, which can help performance on smaller platforms.
-    if input.len() <= platform.simd_degree() * CHUNK_LEN {
-        return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out);
+    let degree = guts::DETECTED_IMPL.degree();
+    if input.len() <= degree * CHUNK_LEN {
+        return guts::DETECTED_IMPL.hash_chunks(input, key, chunk_counter, flags, out);
     }
 
     // With more than simd_degree chunks, we need to recurse. Start by dividing
     // the input into left and right subtrees. (Note that this is only optimal
     // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
     // of 3 or something, we'll need a more complicated strategy.)
-    debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2");
-    let (left, right) = input.split_at(left_len(input.len()));
+    debug_assert_eq!(degree.count_ones(), 1, "power of 2");
+    let (left, right) = input.split_at(guts::left_len(input.len()));
     let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64;
 
-    // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
-    // account for the special case of returning 2 outputs when the SIMD degree
-    // is 1.
-    let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
-    let degree = if left.len() == CHUNK_LEN {
-        // The "simd_degree=1 and we're at the leaf nodes" case.
-        debug_assert_eq!(platform.simd_degree(), 1);
-        1
-    } else {
-        cmp::max(platform.simd_degree(), 2)
-    };
-    let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN);
+    let mut transposed_cvs = guts::TransposedVectors::new();
+    let (left_cvs, right_cvs) = guts::DETECTED_IMPL.split_transposed_vectors(&mut transposed_cvs);
 
     // Recurse! For update_rayon(), this is where we take advantage of RayonJoin and use multiple
     // threads.
     let (left_n, right_n) = J::join(
-        || compress_subtree_wide::<J>(left, key, chunk_counter, flags, platform, left_out),
-        || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, platform, right_out),
+        || compress_subtree_wide::<J>(left, key, chunk_counter, flags, left_cvs),
+        || compress_subtree_wide::<J>(right, key, right_chunk_counter, flags, right_cvs),
     );
 
-    // The special case again. If simd_degree=1, then we'll have left_n=1 and
-    // right_n=1. Rather than compressing them into a single output, return
-    // them directly, to make sure we always have at least two outputs.
-    debug_assert_eq!(left_n, degree);
-    debug_assert!(right_n >= 1 && right_n <= left_n);
-    if left_n == 1 {
-        out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]);
-        return 2;
-    }
-
-    // Otherwise, do one layer of parent node compression.
-    let num_children = left_n + right_n;
-    compress_parents_parallel(
-        &cv_array[..num_children * OUT_LEN],
-        key,
-        flags,
-        platform,
-        out,
-    )
+    // Do one layer of parent node compression. The SIMD degree is always at least 2, so we're
+    // guaranteed that this isn't the root compression.
+    let num_cvs = left_n + right_n;
+    guts::DETECTED_IMPL.hash_parents(&mut transposed_cvs, num_cvs, key, flags, out)
 }
 
 // Hash a subtree with compress_subtree_wide(), and then condense the resulting
@@ -781,50 +532,41 @@ fn compress_subtree_wide<J: join::Join>(
 // chunk or less. That's a different codepath.
 fn compress_subtree_to_parent_node<J: join::Join>(
     input: &[u8],
-    key: &CVWords,
+    key: &CVBytes,
     chunk_counter: u64,
-    flags: u8,
-    platform: Platform,
-) -> [u8; BLOCK_LEN] {
+    flags: u32,
+) -> BlockBytes {
     debug_assert!(input.len() > CHUNK_LEN);
-    let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
-    let mut num_cvs =
-        compress_subtree_wide::<J>(input, &key, chunk_counter, flags, platform, &mut cv_array);
+    let mut transposed_cvs = guts::TransposedVectors::new();
+    let (left_cvs, _) = guts::DETECTED_IMPL.split_transposed_vectors(&mut transposed_cvs);
+    let mut num_cvs = compress_subtree_wide::<J>(input, &key, chunk_counter, flags, left_cvs);
     debug_assert!(num_cvs >= 2);
 
     // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
     // compress_subtree_wide() returns more than 2 chaining values. Condense
     // them into 2 by forming parent nodes repeatedly.
-    let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2];
     while num_cvs > 2 {
-        let cv_slice = &cv_array[..num_cvs * OUT_LEN];
-        num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array);
-        cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]);
+        num_cvs = guts::DETECTED_IMPL.reduce_parents(&mut transposed_cvs, num_cvs, key, flags);
     }
-    *array_ref!(cv_array, 0, 2 * OUT_LEN)
+    transposed_cvs.extract_parent_node(0)
 }
 
 // Hash a complete input all at once. Unlike compress_subtree_wide() and
 // compress_subtree_to_parent_node(), this function handles the 1 chunk case.
-fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVWords, flags: u8) -> Output {
-    let platform = Platform::detect();
-
+fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVBytes, flags: u32) -> Output {
     // If the whole subtree is one chunk, hash it directly with a ChunkState.
     if input.len() <= CHUNK_LEN {
-        return ChunkState::new(key, 0, flags, platform)
-            .update(input)
-            .output();
+        return ChunkState::new(key, 0, flags).update(input).output();
     }
 
     // Otherwise construct an Output object from the parent node returned by
     // compress_subtree_to_parent_node().
     Output {
         input_chaining_value: *key,
-        block: compress_subtree_to_parent_node::<J>(input, key, 0, flags, platform),
+        block: compress_subtree_to_parent_node::<J>(input, key, 0, flags),
         block_len: BLOCK_LEN as u8,
         counter: 0,
-        flags: flags | PARENT,
-        platform,
+        flags: flags as u8 | PARENT as u8,
     }
 }
 
@@ -839,7 +581,7 @@ fn hash_all_at_once<J: join::Join>(input: &[u8], key: &CVWords, flags: u8) -> Ou
 /// This function is always single-threaded. For multithreading support, see
 /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
 pub fn hash(input: &[u8]) -> Hash {
-    hash_all_at_once::<join::SerialJoin>(input, IV, 0).root_hash()
+    hash_all_at_once::<join::SerialJoin>(input, &IV_BYTES, 0).root_hash()
 }
 
 /// The keyed hash function.
@@ -856,9 +598,8 @@ pub fn hash(input: &[u8]) -> Hash {
 /// This function is always single-threaded. For multithreading support, see
 /// [`Hasher::new_keyed`] and
 /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
-pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
-    let key_words = platform::words_from_le_bytes_32(key);
-    hash_all_at_once::<join::SerialJoin>(input, &key_words, KEYED_HASH).root_hash()
+pub fn keyed_hash(key: &CVBytes, input: &[u8]) -> Hash {
+    hash_all_at_once::<join::SerialJoin>(input, key, KEYED_HASH).root_hash()
 }
 
 /// The key derivation function.
@@ -896,12 +637,11 @@ pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash {
 /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon).
 ///
 /// [Argon2]: https://en.wikipedia.org/wiki/Argon2
-pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
+pub fn derive_key(context: &str, key_material: &[u8]) -> CVBytes {
     let context_key =
-        hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+        hash_all_at_once::<join::SerialJoin>(context.as_bytes(), &IV_BYTES, DERIVE_KEY_CONTEXT)
             .root_hash();
-    let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
-    hash_all_at_once::<join::SerialJoin>(key_material, &context_key_words, DERIVE_KEY_MATERIAL)
+    hash_all_at_once::<join::SerialJoin>(key_material, context_key.as_bytes(), DERIVE_KEY_MATERIAL)
         .root_hash()
         .0
 }
@@ -909,9 +649,8 @@ pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] {
 fn parent_node_output(
     left_child: &CVBytes,
     right_child: &CVBytes,
-    key: &CVWords,
-    flags: u8,
-    platform: Platform,
+    key: &CVBytes,
+    flags: u32,
 ) -> Output {
     let mut block = [0; BLOCK_LEN];
     block[..32].copy_from_slice(left_child);
@@ -921,8 +660,7 @@ fn parent_node_output(
         block,
         block_len: BLOCK_LEN as u8,
         counter: 0,
-        flags: flags | PARENT,
-        platform,
+        flags: (flags | PARENT) as u8,
     }
 }
 
@@ -963,7 +701,7 @@ fn parent_node_output(
 #[derive(Clone)]
 #[cfg_attr(feature = "zeroize", derive(zeroize::Zeroize))]
 pub struct Hasher {
-    key: CVWords,
+    key: CVBytes,
     chunk_state: ChunkState,
     // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
     // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
@@ -974,26 +712,25 @@ pub struct Hasher {
 }
 
 impl Hasher {
-    fn new_internal(key: &CVWords, flags: u8) -> Self {
+    fn new_internal(key: &CVBytes, flags: u32) -> Self {
         Self {
             key: *key,
-            chunk_state: ChunkState::new(key, 0, flags, Platform::detect()),
+            chunk_state: ChunkState::new(key, 0, flags),
             cv_stack: ArrayVec::new(),
         }
     }
 
     /// Construct a new `Hasher` for the regular hash function.
     pub fn new() -> Self {
-        Self::new_internal(IV, 0)
+        Self::new_internal(&IV_BYTES, 0)
     }
 
     /// Construct a new `Hasher` for the keyed hash function. See
     /// [`keyed_hash`].
     ///
     /// [`keyed_hash`]: fn.keyed_hash.html
-    pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self {
-        let key_words = platform::words_from_le_bytes_32(key);
-        Self::new_internal(&key_words, KEYED_HASH)
+    pub fn new_keyed(key: &CVBytes) -> Self {
+        Self::new_internal(key, KEYED_HASH)
     }
 
     /// Construct a new `Hasher` for the key derivation function. See
@@ -1003,10 +740,9 @@ impl Hasher {
     /// [`derive_key`]: fn.derive_key.html
     pub fn new_derive_key(context: &str) -> Self {
         let context_key =
-            hash_all_at_once::<join::SerialJoin>(context.as_bytes(), IV, DERIVE_KEY_CONTEXT)
+            hash_all_at_once::<join::SerialJoin>(context.as_bytes(), &IV_BYTES, DERIVE_KEY_CONTEXT)
                 .root_hash();
-        let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes());
-        Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL)
+        Self::new_internal(context_key.as_bytes(), DERIVE_KEY_MATERIAL)
     }
 
     /// Reset the `Hasher` to its initial state.
@@ -1014,12 +750,7 @@ impl Hasher {
     /// This is functionally the same as overwriting the `Hasher` with a new
     /// one, using the same key or context string if any.
     pub fn reset(&mut self) -> &mut Self {
-        self.chunk_state = ChunkState::new(
-            &self.key,
-            0,
-            self.chunk_state.flags,
-            self.chunk_state.platform,
-        );
+        self.chunk_state = ChunkState::new(&self.key, 0, self.chunk_state.flags as u32);
         self.cv_stack.clear();
         self
     }
@@ -1044,8 +775,7 @@ impl Hasher {
                 &left_child,
                 &right_child,
                 &self.key,
-                self.chunk_state.flags,
-                self.chunk_state.platform,
+                self.chunk_state.flags as u32,
             );
             self.cv_stack.push(parent_output.chaining_value());
         }
@@ -1118,8 +848,7 @@ impl Hasher {
                 self.chunk_state = ChunkState::new(
                     &self.key,
                     self.chunk_state.chunk_counter + 1,
-                    self.chunk_state.flags,
-                    self.chunk_state.platform,
+                    self.chunk_state.flags as u32,
                 );
             } else {
                 return self;
@@ -1142,7 +871,7 @@ impl Hasher {
         while input.len() > CHUNK_LEN {
             debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data");
             debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
-            let mut subtree_len = largest_power_of_two_leq(input.len());
+            let mut subtree_len = guts::largest_power_of_two_leq(input.len());
             let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
             // Shrink the subtree_len until it evenly divides the count so far.
             // We know that subtree_len itself is a power of 2, so we can use a
@@ -1174,8 +903,7 @@ impl Hasher {
                     &ChunkState::new(
                         &self.key,
                         self.chunk_state.chunk_counter,
-                        self.chunk_state.flags,
-                        self.chunk_state.platform,
+                        self.chunk_state.flags as u32,
                     )
                     .update(&input[..subtree_len])
                     .output()
@@ -1189,11 +917,10 @@ impl Hasher {
                     &input[..subtree_len],
                     &self.key,
                     self.chunk_state.chunk_counter,
-                    self.chunk_state.flags,
-                    self.chunk_state.platform,
+                    self.chunk_state.flags as u32,
                 );
-                let left_cv = array_ref!(cv_pair, 0, 32);
-                let right_cv = array_ref!(cv_pair, 32, 32);
+                let left_cv = cv_pair[..32].try_into().unwrap();
+                let right_cv = cv_pair[32..].try_into().unwrap();
                 // Push the two CVs we received into the CV stack in order. Because
                 // the stack merges lazily, this guarantees we aren't merging the
                 // root.
@@ -1256,8 +983,7 @@ impl Hasher {
                 &self.cv_stack[num_cvs_remaining - 2],
                 &self.cv_stack[num_cvs_remaining - 1],
                 &self.key,
-                self.chunk_state.flags,
-                self.chunk_state.platform,
+                self.chunk_state.flags as u32,
             );
             num_cvs_remaining -= 2;
         }
@@ -1266,8 +992,7 @@ impl Hasher {
                 &self.cv_stack[num_cvs_remaining - 1],
                 &output.chaining_value(),
                 &self.key,
-                self.chunk_state.flags,
-                self.chunk_state.platform,
+                self.chunk_state.flags as u32,
             );
             num_cvs_remaining -= 1;
         }
@@ -1481,7 +1206,6 @@ impl fmt::Debug for Hasher {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         f.debug_struct("Hasher")
             .field("flags", &self.chunk_state.flags)
-            .field("platform", &self.chunk_state.platform)
             .finish()
     }
 }
@@ -1546,6 +1270,62 @@ impl OutputReader {
         }
     }
 
+    // Shared body of fill() and fill_xor(): `xor` selects whether output overwrites `buf` or is
+    // xor'ed into it. The partial-block logic is nontrivial, so it isn't duplicated per caller.
+    #[inline(always)]
+    fn fill_inner(&mut self, mut buf: &mut [u8], xor: bool) {
+        debug_assert!(self.position_within_block < BLOCK_LEN as u8);
+        let xof_fn = if xor {
+            guts::Implementation::xof_xor
+        } else {
+            guts::Implementation::xof
+        };
+        if self.position_within_block != 0 {
+            // The xof() and xof_xor() APIs can handle a partial block at the end but not a partial
+            // block at the beginning. We handle the beginning case here. Start by computing the
+            // complete block that we need part of.
+            let mut partial_block = [0u8; 64];
+            xof_fn(
+                &guts::DETECTED_IMPL,
+                &self.inner.block,
+                self.inner.block_len as u32,
+                &self.inner.input_chaining_value,
+                self.inner.counter,
+                self.inner.flags as u32,
+                &mut partial_block,
+            );
+            let output_bytes = &partial_block[self.position_within_block as usize..];
+            let take = cmp::min(buf.len(), output_bytes.len());
+            if xor {
+                for byte_index in 0..take {
+                    buf[byte_index] ^= output_bytes[byte_index];
+                }
+            } else {
+                buf[..take].copy_from_slice(&output_bytes[..take]);
+            }
+            buf = &mut buf[take..];
+            self.position_within_block += take as u8;
+            if self.position_within_block == BLOCK_LEN as u8 {
+                self.position_within_block = 0;
+                self.inner.counter += 1;
+            } else {
+                debug_assert!(buf.is_empty());
+                return;
+            }
+        }
+        xof_fn(
+            &guts::DETECTED_IMPL,
+            &self.inner.block,
+            self.inner.block_len as u32,
+            &self.inner.input_chaining_value,
+            self.inner.counter,
+            self.inner.flags as u32,
+            buf,
+        );
+        self.inner.counter += (buf.len() / BLOCK_LEN) as u64;
+        self.position_within_block = (buf.len() % BLOCK_LEN) as u8;
+    }
+
     /// Fill a buffer with output bytes and advance the position of the
     /// `OutputReader`. This is equivalent to [`Read::read`], except that it
     /// doesn't return a `Result`. Both methods always fill the entire buffer.
@@ -1561,19 +1341,12 @@ impl OutputReader {
     /// reading further, the behavior is unspecified.
     ///
     /// [`Read::read`]: #method.read
-    pub fn fill(&mut self, mut buf: &mut [u8]) {
-        while !buf.is_empty() {
-            let block: [u8; BLOCK_LEN] = self.inner.root_output_block();
-            let output_bytes = &block[self.position_within_block as usize..];
-            let take = cmp::min(buf.len(), output_bytes.len());
-            buf[..take].copy_from_slice(&output_bytes[..take]);
-            buf = &mut buf[take..];
-            self.position_within_block += take as u8;
-            if self.position_within_block == BLOCK_LEN as u8 {
-                self.inner.counter += 1;
-                self.position_within_block = 0;
-            }
-        }
+    pub fn fill(&mut self, buf: &mut [u8]) {
+        self.fill_inner(buf, false);
+    }
+
+    pub fn fill_xor(&mut self, buf: &mut [u8]) {
+        self.fill_inner(buf, true);
     }
 
     /// Return the current read position in the output stream. This is
diff --git a/src/portable.rs b/src/portable.rs
index 7af6828b0..b82bd7f21 100644
--- a/src/portable.rs
+++ b/src/portable.rs
@@ -181,10 +181,10 @@ pub fn hash_many<const N: usize>(
 pub mod test {
     use super::*;
 
-    // This is basically testing the portable implementation against itself,
-    // but it also checks that compress_in_place and compress_xof are
-    // consistent. And there are tests against the reference implementation and
-    // against hardcoded test vectors elsewhere.
+    // These are basically testing the portable implementation against itself, but we also check
+    // that compress_in_place and compress_xof are consistent. And there are tests against the
+    // reference implementation and against hardcoded test vectors elsewhere.
+
     #[test]
     fn test_compress() {
         crate::test::test_compress_fn(compress_in_place, compress_xof);
diff --git a/src/test.rs b/src/test.rs
index fb1e8495e..8d5c6ec7c 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -1,6 +1,7 @@
-use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
-use arrayref::array_ref;
-use arrayvec::ArrayVec;
+use blake3_guts as guts;
+use guts::{CVBytes, CVWords, BLOCK_LEN, CHUNK_LEN};
+
+use core::cmp;
 use core::usize;
 use rand::prelude::*;
 
@@ -46,172 +47,12 @@ pub const TEST_CASES: &[usize] = &[
 pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN;
 
 // There's a test to make sure these two are equal below.
-pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend";
-pub const TEST_KEY_WORDS: CVWords = [
-    1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521,
-];
-
-// Paint the input with a repeating byte pattern. We use a cycle length of 251,
-// because that's the largest prime number less than 256. This makes it
-// unlikely to swapping any two adjacent input blocks or chunks will give the
-// same answer.
-pub fn paint_test_input(buf: &mut [u8]) {
-    for (i, b) in buf.iter_mut().enumerate() {
-        *b = (i % 251) as u8;
-    }
-}
-
-type CompressInPlaceFn =
-    unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);
-
-type CompressXofFn = unsafe fn(
-    cv: &CVWords,
-    block: &[u8; BLOCK_LEN],
-    block_len: u8,
-    counter: u64,
-    flags: u8,
-) -> [u8; 64];
-
-// A shared helper function for platform-specific tests.
-pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) {
-    let initial_state = TEST_KEY_WORDS;
-    let block_len: u8 = 61;
-    let mut block = [0; BLOCK_LEN];
-    paint_test_input(&mut block[..block_len as usize]);
-    // Use a counter with set bits in both 32-bit words.
-    let counter = (5u64 << 32) + 6;
-    let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH;
-
-    let portable_out =
-        crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags);
-
-    let mut test_state = initial_state;
-    unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) };
-    let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state);
-    let test_xof =
-        unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) };
-
-    assert_eq!(&portable_out[..32], &test_state_bytes[..]);
-    assert_eq!(&portable_out[..], &test_xof[..]);
-}
-
-type HashManyFn<A> = unsafe fn(
-    inputs: &[&A],
-    key: &CVWords,
-    counter: u64,
-    increment_counter: IncrementCounter,
-    flags: u8,
-    flags_start: u8,
-    flags_end: u8,
-    out: &mut [u8],
-);
-
-// A shared helper function for platform-specific tests.
-pub fn test_hash_many_fn(
-    hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>,
-    hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>,
-) {
-    // Test a few different initial counter values.
-    // - 0: The base case.
-    // - u32::MAX: The low word of the counter overflows for all inputs except the first.
-    // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR
-    //   when you're supposed to ANDNOT...
-    let initial_counters = [0, u32::MAX as u64, i32::MAX as u64];
-    for counter in initial_counters {
-        #[cfg(feature = "std")]
-        dbg!(counter);
-
-        // 31 (16 + 8 + 4 + 2 + 1) inputs
-        const NUM_INPUTS: usize = 31;
-        let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS];
-        crate::test::paint_test_input(&mut input_buf);
-
-        // First hash chunks.
-        let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new();
-        for i in 0..NUM_INPUTS {
-            chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN));
-        }
-        let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN];
-        crate::portable::hash_many(
-            &chunks,
-            &TEST_KEY_WORDS,
-            counter,
-            IncrementCounter::Yes,
-            crate::KEYED_HASH,
-            crate::CHUNK_START,
-            crate::CHUNK_END,
-            &mut portable_chunks_out,
-        );
-
-        let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN];
-        unsafe {
-            hash_many_chunks_fn(
-                &chunks[..],
-                &TEST_KEY_WORDS,
-                counter,
-                IncrementCounter::Yes,
-                crate::KEYED_HASH,
-                crate::CHUNK_START,
-                crate::CHUNK_END,
-                &mut test_chunks_out,
-            );
-        }
-        for n in 0..NUM_INPUTS {
-            #[cfg(feature = "std")]
-            dbg!(n);
-            assert_eq!(
-                &portable_chunks_out[n * OUT_LEN..][..OUT_LEN],
-                &test_chunks_out[n * OUT_LEN..][..OUT_LEN]
-            );
-        }
-
-        // Then hash parents.
-        let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new();
-        for i in 0..NUM_INPUTS {
-            parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN));
-        }
-        let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN];
-        crate::portable::hash_many(
-            &parents,
-            &TEST_KEY_WORDS,
-            counter,
-            IncrementCounter::No,
-            crate::KEYED_HASH | crate::PARENT,
-            0,
-            0,
-            &mut portable_parents_out,
-        );
-
-        let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN];
-        unsafe {
-            hash_many_parents_fn(
-                &parents[..],
-                &TEST_KEY_WORDS,
-                counter,
-                IncrementCounter::No,
-                crate::KEYED_HASH | crate::PARENT,
-                0,
-                0,
-                &mut test_parents_out,
-            );
-        }
-        for n in 0..NUM_INPUTS {
-            #[cfg(feature = "std")]
-            dbg!(n);
-            assert_eq!(
-                &portable_parents_out[n * OUT_LEN..][..OUT_LEN],
-                &test_parents_out[n * OUT_LEN..][..OUT_LEN]
-            );
-        }
-    }
-}
+pub const TEST_KEY: &CVBytes = b"whats the Elvish word for friend";
+pub const TEST_KEY_WORDS: &CVWords = &guts::words_from_le_bytes_32(TEST_KEY);
 
 #[test]
 fn test_key_bytes_equal_key_words() {
-    assert_eq!(
-        TEST_KEY_WORDS,
-        crate::platform::words_from_le_bytes_32(&TEST_KEY),
-    );
+    assert_eq!(TEST_KEY, &guts::le_bytes_from_words_32(TEST_KEY_WORDS),);
 }
 
 #[test]
@@ -224,52 +65,9 @@ fn test_reference_impl_size() {
     assert_eq!(1880, core::mem::size_of::<reference_impl::Hasher>());
 }
 
-#[test]
-fn test_counter_words() {
-    let counter: u64 = (1 << 32) + 2;
-    assert_eq!(crate::counter_low(counter), 2);
-    assert_eq!(crate::counter_high(counter), 1);
-}
-
-#[test]
-fn test_largest_power_of_two_leq() {
-    let input_output = &[
-        // The zero case is nonsensical, but it does work.
-        (0, 1),
-        (1, 1),
-        (2, 2),
-        (3, 2),
-        (4, 4),
-        (5, 4),
-        (6, 4),
-        (7, 4),
-        (8, 8),
-        // the largest possible usize
-        (usize::MAX, (usize::MAX >> 1) + 1),
-    ];
-    for &(input, output) in input_output {
-        assert_eq!(
-            output,
-            crate::largest_power_of_two_leq(input),
-            "wrong output for n={}",
-            input
-        );
-    }
-}
-
-#[test]
-fn test_left_len() {
-    let input_output = &[
-        (CHUNK_LEN + 1, CHUNK_LEN),
-        (2 * CHUNK_LEN - 1, CHUNK_LEN),
-        (2 * CHUNK_LEN, CHUNK_LEN),
-        (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN),
-        (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN),
-        (4 * CHUNK_LEN, 2 * CHUNK_LEN),
-        (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN),
-    ];
-    for &(input, output) in input_output {
-        assert_eq!(crate::left_len(input), output);
+pub(crate) fn paint_test_input(buf: &mut [u8]) {
+    for (i, b) in buf.iter_mut().enumerate() {
+        *b = (i % 251) as u8;
     }
 }
 
@@ -292,18 +90,18 @@ fn test_compare_reference_impl() {
 
             // all at once
             let test_out = crate::hash(input);
-            assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
+            assert_eq!(test_out, expected_out[..32]);
             // incremental
             let mut hasher = crate::Hasher::new();
             hasher.update(input);
-            assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+            assert_eq!(hasher.finalize(), expected_out[..32]);
             assert_eq!(hasher.finalize(), test_out);
             // incremental (rayon)
             #[cfg(feature = "rayon")]
             {
                 let mut hasher = crate::Hasher::new();
                 hasher.update_rayon(input);
-                assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+                assert_eq!(hasher.finalize(), expected_out[..32]);
                 assert_eq!(hasher.finalize(), test_out);
             }
             // xof
@@ -314,25 +112,25 @@ fn test_compare_reference_impl() {
 
         // keyed
         {
-            let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
+            let mut reference_hasher = reference_impl::Hasher::new_keyed(TEST_KEY);
             reference_hasher.update(input);
             let mut expected_out = [0; OUT];
             reference_hasher.finalize(&mut expected_out);
 
             // all at once
-            let test_out = crate::keyed_hash(&TEST_KEY, input);
-            assert_eq!(test_out, *array_ref!(expected_out, 0, 32));
+            let test_out = crate::keyed_hash(TEST_KEY, input);
+            assert_eq!(test_out, expected_out[..32]);
             // incremental
-            let mut hasher = crate::Hasher::new_keyed(&TEST_KEY);
+            let mut hasher = crate::Hasher::new_keyed(TEST_KEY);
             hasher.update(input);
-            assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+            assert_eq!(hasher.finalize(), expected_out[..32]);
             assert_eq!(hasher.finalize(), test_out);
             // incremental (rayon)
             #[cfg(feature = "rayon")]
             {
-                let mut hasher = crate::Hasher::new_keyed(&TEST_KEY);
+                let mut hasher = crate::Hasher::new_keyed(TEST_KEY);
                 hasher.update_rayon(input);
-                assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
+                assert_eq!(hasher.finalize(), expected_out[..32]);
                 assert_eq!(hasher.finalize(), test_out);
             }
             // xof
@@ -355,15 +153,15 @@ fn test_compare_reference_impl() {
             // incremental
             let mut hasher = crate::Hasher::new_derive_key(context);
             hasher.update(input);
-            assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
-            assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32));
+            assert_eq!(hasher.finalize(), expected_out[..32]);
+            assert_eq!(hasher.finalize(), test_out[..32]);
             // incremental (rayon)
             #[cfg(feature = "rayon")]
             {
                 let mut hasher = crate::Hasher::new_derive_key(context);
                 hasher.update_rayon(input);
-                assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32));
-                assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32));
+                assert_eq!(hasher.finalize(), expected_out[..32]);
+                assert_eq!(hasher.finalize(), test_out[..32]);
             }
             // xof
             let mut extended = [0; OUT];
@@ -423,9 +221,9 @@ fn test_fuzz_hasher() {
     let mut input_buf = [0; 3 * INPUT_MAX];
     paint_test_input(&mut input_buf);
 
-    // Don't do too many iterations in debug mode, to keep the tests under a
-    // second or so. CI should run tests in release mode also. Provide an
-    // environment variable for specifying a larger number of fuzz iterations.
+    // Don't do too many iterations in debug mode, to keep the tests under a second or so. CI
+    // should run tests in release mode also.
+    // TODO: Provide an environment variable for specifying a larger number of fuzz iterations?
     let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 };
 
     // Use a fixed RNG seed for reproducibility.
@@ -493,6 +291,133 @@ fn test_xof_seek() {
     }
 }
 
+#[test]
+fn test_xof_xor() {
+    for step in [32, 63, 64, 128, 303] {
+        #[cfg(feature = "std")]
+        dbg!(step);
+        let mut ref_hasher = reference_impl::Hasher::new();
+        ref_hasher.update(b"foo");
+        let mut ref_output = [0u8; 1000];
+        ref_hasher.finalize(&mut ref_output);
+
+        let mut hasher = crate::Hasher::new();
+        hasher.update(b"foo");
+        let mut reader = hasher.finalize_xof();
+
+        let mut test_output = [0u8; 1000];
+        for chunk in test_output.chunks_mut(step) {
+            reader.fill(chunk);
+        }
+        assert_eq!(ref_output, test_output);
+        // Xor'ing the same output should zero the buffer.
+        reader.set_position(0);
+        for chunk in test_output.chunks_mut(step) {
+            reader.fill_xor(chunk);
+        }
+        assert_eq!([0u8; 1000], test_output);
+        // Xor'ing the same output again should reproduce the original.
+        reader.set_position(0);
+        for chunk in test_output.chunks_mut(step) {
+            reader.fill_xor(chunk);
+        }
+        assert_eq!(ref_output, test_output);
+
+        // Repeat the same test but starting at offset 500.
+        reader.set_position(500);
+        for chunk in test_output[..500].chunks_mut(step) {
+            reader.fill(chunk);
+        }
+        assert_eq!(ref_output[500..], test_output[..500]);
+        reader.set_position(500);
+        for chunk in test_output[..500].chunks_mut(step) {
+            reader.fill_xor(chunk);
+        }
+        assert_eq!([0u8; 500], test_output[..500]);
+        reader.set_position(500);
+        for chunk in test_output[..500].chunks_mut(step) {
+            reader.fill_xor(chunk);
+        }
+        assert_eq!(ref_output[500..], test_output[..500]);
+    }
+}
+
+// Fuzz `fill` and `fill_xor` with random seeks and read lengths. `fill` is checked against the
+// reference_impl output, and `fill_xor` is checked by xor'ing that same output back out.
+#[test]
+#[cfg(feature = "std")]
+fn test_fuzz_xof() {
+    // Use a fixed RNG seed for reproducibility.
+    let mut rng = rand_chacha::ChaCha8Rng::from_seed([99; 32]);
+    let random_key: [u8; 32] = rng.gen();
+
+    let possible_seeks = [-64i64, -63, -1, 0, 1, 63, 64, 127, 128, 129];
+
+    const MAX_LEN: usize = 1100;
+    let possible_lengths = [0usize, 1, 63, 64, 65, 128, 256, 512, 1024, MAX_LEN];
+    assert!(possible_lengths.into_iter().all(|x| x <= MAX_LEN));
+
+    let mut xof_output = crate::Hasher::new_keyed(&random_key).finalize_xof();
+    let mut xof_xor_output = crate::Hasher::new_keyed(&random_key).finalize_xof();
+
+    // Don't do too many iterations in debug mode, to keep the tests under a second or so. CI
+    // should run tests in release mode also.
+    // TODO: Provide an environment variable for specifying a larger number of fuzz iterations?
+    let num_tests = if cfg!(debug_assertions) {
+        1_000
+    } else {
+        100_000
+    };
+
+    let mut position = 0u64;
+    let mut ref_output = Vec::new();
+    for test_i in 0..num_tests {
+        eprintln!("--- test {test_i} ---");
+        // Maybe do a random relative seek (zero means no seek). Out-of-range targets reset to 0.
+        let relative_seek: i64 = *possible_seeks.choose(&mut rng).unwrap();
+        dbg!(relative_seek);
+        if relative_seek != 0 {
+            let new_position = position as i64 + relative_seek;
+            if 0 <= new_position && new_position <= MAX_LEN as i64 {
+                position = new_position as u64;
+            } else {
+                position = 0;
+            }
+            assert_eq!(xof_output.position(), xof_xor_output.position());
+            xof_output.set_position(position);
+            xof_xor_output.set_position(position);
+        }
+        dbg!(position);
+
+        // Generate a random number of output bytes. If the amount of output we've gotten from the
+        // reference_impl can't cover this read, regenerate it at double the size or more.
+        let len: usize = *possible_lengths.choose(&mut rng).unwrap();
+        dbg!(len);
+        if position as usize + len > ref_output.len() {
+            let new_len = cmp::max(position as usize + len, 2 * ref_output.len()).max(MAX_LEN);
+            ref_output = vec![0u8; new_len];
+            eprintln!("grow reference output length to {}", ref_output.len());
+            let ref_hasher = reference_impl::Hasher::new_keyed(&random_key);
+            ref_hasher.finalize(&mut ref_output);
+        }
+        let mut buf = [0u8; MAX_LEN];
+        xof_output.fill(&mut buf[..len]);
+        assert_eq!(ref_output[position as usize..][..len], buf[..len]);
+        assert_eq!([0u8; MAX_LEN][..MAX_LEN - len], buf[len..]);
+
+        // Xor over the output with a random byte value, and then confirm that xof_xor() recovers
+        // that value.
+        let random_byte: u8 = rng.gen();
+        dbg!(random_byte);
+        buf[..len].iter_mut().for_each(|b| *b ^= random_byte);
+        xof_xor_output.fill_xor(&mut buf[..len]);
+        assert_eq!([random_byte; MAX_LEN][..len], buf[..len]);
+        assert_eq!([0u8; MAX_LEN][..MAX_LEN - len], buf[len..]);
+
+        position += len as u64;
+    }
+}
+
 #[test]
 fn test_msg_schedule_permutation() {
     let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];
@@ -506,7 +431,7 @@ fn test_msg_schedule_permutation() {
         }
     }
 
-    assert_eq!(generated, crate::MSG_SCHEDULE);
+    assert_eq!(generated, guts::MSG_SCHEDULE);
 }
 
 #[test]
@@ -640,53 +565,43 @@ fn test_zeroize() {
 
     let mut hasher = crate::Hasher {
         chunk_state: crate::ChunkState {
-            cv: [42; 8],
+            cv: [42; 32],
             chunk_counter: 42,
             buf: [42; 64],
             buf_len: 42,
             blocks_compressed: 42,
             flags: 42,
-            platform: crate::Platform::Portable,
         },
-        key: [42; 8],
+        key: [42; 32],
         cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(),
     };
     hasher.zeroize();
-    assert_eq!(hasher.chunk_state.cv, [0; 8]);
+    assert_eq!(hasher.chunk_state.cv, [0; 32]);
     assert_eq!(hasher.chunk_state.chunk_counter, 0);
     assert_eq!(hasher.chunk_state.buf, [0; 64]);
     assert_eq!(hasher.chunk_state.buf_len, 0);
     assert_eq!(hasher.chunk_state.blocks_compressed, 0);
     assert_eq!(hasher.chunk_state.flags, 0);
-    assert!(matches!(
-        hasher.chunk_state.platform,
-        crate::Platform::Portable
-    ));
-    assert_eq!(hasher.key, [0; 8]);
+    assert_eq!(hasher.key, [0; 32]);
     assert_eq!(&*hasher.cv_stack, &[[0u8; 32]; 0]);
 
     let mut output_reader = crate::OutputReader {
         inner: crate::Output {
-            input_chaining_value: [42; 8],
+            input_chaining_value: [42; 32],
             block: [42; 64],
             counter: 42,
             block_len: 42,
             flags: 42,
-            platform: crate::Platform::Portable,
         },
         position_within_block: 42,
     };
 
     output_reader.zeroize();
-    assert_eq!(output_reader.inner.input_chaining_value, [0; 8]);
+    assert_eq!(output_reader.inner.input_chaining_value, [0; 32]);
     assert_eq!(output_reader.inner.block, [0; 64]);
     assert_eq!(output_reader.inner.counter, 0);
     assert_eq!(output_reader.inner.block_len, 0);
     assert_eq!(output_reader.inner.flags, 0);
-    assert!(matches!(
-        output_reader.inner.platform,
-        crate::Platform::Portable
-    ));
     assert_eq!(output_reader.position_within_block, 0);
 }
 
diff --git a/test_vectors/Cargo.toml b/test_vectors/Cargo.toml
index 87a9eba19..21098d41d 100644
--- a/test_vectors/Cargo.toml
+++ b/test_vectors/Cargo.toml
@@ -12,6 +12,7 @@ pure = ["blake3/pure"]
 # If you ever change these path dependencies, you'll probably need to update
 # cross_test.sh, or CI will break. I'm sorry >.<
 blake3 = { path = "../" }
+blake3_guts = { path = "../rust/guts" }
 hex = "0.4.0"
 reference_impl = { path = "../reference_impl" }
 serde = { version = "1.0", features = ["derive"] }
diff --git a/test_vectors/src/lib.rs b/test_vectors/src/lib.rs
index 6a4c79846..ea2491a92 100644
--- a/test_vectors/src/lib.rs
+++ b/test_vectors/src/lib.rs
@@ -1,4 +1,4 @@
-use blake3::guts::{BLOCK_LEN, CHUNK_LEN};
+use blake3_guts::{BLOCK_LEN, CHUNK_LEN};
 use serde::{Deserialize, Serialize};
 
 // A non-multiple of 4 is important, since one possible bug is to fail to emit