From c6decdf36f64e4c651edcf60e12527acfb9d9c9c Mon Sep 17 00:00:00 2001
From: hev
Date: Tue, 26 Sep 2023 21:06:49 +0800
Subject: [PATCH] sha2: Add inline-asm backend for LoongArch64 targets (#507)

---
 .github/workflows/sha2.yml         |  12 ++
 Cargo.lock                         |   2 +-
 sha2/CHANGELOG.md                  |   6 +
 sha2/Cargo.toml                    |   5 +-
 sha2/src/sha256.rs                 |   3 +
 sha2/src/sha256/loongarch64_asm.rs | 227 +++++++++++++++++++++++++++
 sha2/src/sha512.rs                 |   3 +
 sha2/src/sha512/loongarch64_asm.rs | 242 +++++++++++++++++++++++++++++
 8 files changed, 498 insertions(+), 2 deletions(-)
 create mode 100644 sha2/src/sha256/loongarch64_asm.rs
 create mode 100644 sha2/src/sha512/loongarch64_asm.rs

diff --git a/.github/workflows/sha2.yml b/.github/workflows/sha2.yml
index 17ad938e6..1ce8ac3ae 100644
--- a/.github/workflows/sha2.yml
+++ b/.github/workflows/sha2.yml
@@ -121,6 +121,18 @@ jobs:
       - uses: msys2/setup-msys2@v2
       - run: cargo test --target ${{ matrix.target }}
 
+  # Build-only test of the LoongArch64 assembly backend
+  loongarch64_asm:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: RustCrypto/actions/cargo-cache@master
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: 1.72
+          targets: loongarch64-unknown-linux-gnu
+      - run: cargo build --target loongarch64-unknown-linux-gnu --features loongarch64_asm
+
   # Cross-compiled tests
   cross:
     needs: set-msrv
diff --git a/Cargo.lock b/Cargo.lock
index 0afe260d6..11d4f3dbb 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -218,7 +218,7 @@ dependencies = [
 
 [[package]]
 name = "sha2"
-version = "0.10.7"
+version = "0.10.8"
 dependencies = [
  "cfg-if",
  "cpufeatures",
diff --git a/sha2/CHANGELOG.md b/sha2/CHANGELOG.md
index a552266e5..a5182bcad 100644
--- a/sha2/CHANGELOG.md
+++ b/sha2/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## 0.10.8 (2023-09-26)
+### Added
+- `asm!`-based backend for LoongArch64 targets gated behind `loongarch64_asm` feature [#507]
+
+[#507]: https://github.com/RustCrypto/hashes/pull/507
+
 ## 0.10.7 (2023-06-15)
 ### Added
 - AArch64 Neon-based backend ([#490])
diff --git a/sha2/Cargo.toml b/sha2/Cargo.toml
index c25251db8..a3dafeaa2 100644
--- a/sha2/Cargo.toml
+++ b/sha2/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "sha2"
-version = "0.10.7"
+version = "0.10.8"
 description = """
 Pure Rust implementation of the SHA-2 hash function family
 including SHA-224, SHA-256, SHA-384, and SHA-512.
@@ -31,6 +31,9 @@ default = ["std"]
 std = ["digest/std"]
 oid = ["digest/oid"] # Enable OID support. WARNING: Bumps MSRV to 1.57
 asm = ["sha2-asm"] # WARNING: this feature SHOULD NOT be enabled by library crates
+# Use assembly backend for LoongArch64 targets
+# WARNING: Bumps MSRV to 1.72. This feature SHOULD NOT be enabled by library crates
+loongarch64_asm = []
 compress = [] # Expose compress functions
 force-soft = [] # Force software implementation
 asm-aarch64 = ["asm"] # DEPRECATED: use `asm` instead
diff --git a/sha2/src/sha256.rs b/sha2/src/sha256.rs
index a45331e17..8f8287836 100644
--- a/sha2/src/sha256.rs
+++ b/sha2/src/sha256.rs
@@ -17,6 +17,9 @@ cfg_if::cfg_if! {
         mod soft;
         mod aarch64;
         use aarch64::compress;
+    } else if #[cfg(all(feature = "loongarch64_asm", target_arch = "loongarch64"))] {
+        mod loongarch64_asm;
+        use loongarch64_asm::compress;
     } else {
         mod soft;
         use soft::compress;
diff --git a/sha2/src/sha256/loongarch64_asm.rs b/sha2/src/sha256/loongarch64_asm.rs
new file mode 100644
index 000000000..c80fce8bb
--- /dev/null
+++ b/sha2/src/sha256/loongarch64_asm.rs
@@ -0,0 +1,227 @@
+//! LoongArch64 assembly backend
+
+macro_rules! c {
+    ($($l:expr)*) => {
+        concat!($($l ,)*)
+    };
+}
+
+macro_rules! rounda {
+    ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+        c!(
+            "ld.w $a5, $a1, (" $i " * 4);"
+            "revb.2h $a5, $a5;"
+            "rotri.w $a5, $a5, 16;"
+            roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
+        )
+    };
+}
+
+macro_rules! roundb {
+    ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+        c!(
+            "ld.w $a4, $sp, (((" $i " - 15) & 0xF) * 4);"
+            "ld.w $a5, $sp, (((" $i " - 16) & 0xF) * 4);"
+            "ld.w $a6, $sp, (((" $i " - 7) & 0xF) * 4);"
+            "add.w $a5, $a5, $a6;"
+            "rotri.w $a6, $a4, 18;"
+            "srli.w $a7, $a4, 3;"
+            "rotri.w $a4, $a4, 7;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "add.w $a5, $a5, $a4;"
+            "ld.w $a4, $sp, (((" $i " - 2) & 0xF) * 4);"
+            "rotri.w $a6, $a4, 19;"
+            "srli.w $a7, $a4, 10;"
+            "rotri.w $a4, $a4, 17;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "add.w $a5, $a5, $a4;"
+            roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h)
+        )
+    };
+}
+
+macro_rules! roundtail {
+    ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+        c!(
+            // Part 0
+            "rotri.w $a6, " $e ", 11;"
+            "rotri.w $a7, " $e ", 25;"
+            "rotri.w $a4, " $e ", 6;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "xor $a6, " $g ", " $f ";"
+            "ld.w $a7, $a3, " $i " * 4;"
+            "and $a6, $a6, " $e ";"
+            "xor $a6, $a6, " $g ";"
+            "add.w $a4, $a4, $a6;"
+            "add.w $a4, $a4, $a7;"
+            "add.w " $h ", " $h ", $a5;"
+            "add.w " $h ", " $h ", $a4;"
+            // Part 1
+            "add.w " $d ", " $d ", " $h ";"
+            // Part 2
+            "rotri.w $a6, " $a ", 13;"
+            "rotri.w $a7, " $a ", 22;"
+            "rotri.w $a4, " $a ", 2;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "add.w " $h ", " $h ", $a4;"
+            "or $a4, " $c ", " $b ";"
+            "and $a6, " $c ", " $b ";"
+            "and $a4, $a4, " $a ";"
+            "or $a4, $a4, $a6;"
+            "add.w " $h ", " $h ", $a4;"
+            "st.w $a5, $sp, ((" $i " & 0xF) * 4);"
+        )
+    };
+}
+
+pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
+    if blocks.is_empty() {
+        return;
+    }
+
+    unsafe {
+        core::arch::asm!(
+            // Allocate scratch stack space
+            "addi.d $sp, $sp, -64;",
+
+            // Load state
+            "ld.w $t0, $a0, 0",
+            "ld.w $t1, $a0, 4",
+            "ld.w $t2, $a0, 8",
+            "ld.w $t3, $a0, 12",
+            "ld.w $t4, $a0, 16",
+            "ld.w $t5, $a0, 20",
+            "ld.w $t6, $a0, 24",
+            "ld.w $t7, $a0, 28",
+
+            "42:",
+
+            // Do 64 rounds of hashing
+            rounda!( 0, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            rounda!( 1, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            rounda!( 2, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            rounda!( 3, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            rounda!( 4, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            rounda!( 5, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            rounda!( 6, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            rounda!( 7, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            rounda!( 8, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
"$t7"), + rounda!( 9, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + rounda!(10, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + rounda!(11, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + rounda!(12, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + rounda!(13, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + rounda!(14, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + rounda!(15, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(16, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(17, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(18, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(19, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(20, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(21, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(22, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(23, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(24, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(25, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(26, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(27, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(28, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(29, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(30, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(31, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(32, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(33, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(34, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(35, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(36, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(37, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(38, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(39, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(40, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(41, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(42, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(43, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(44, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(45, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(46, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(47, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(48, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(49, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(50, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(51, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(52, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(53, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(54, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(55, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(56, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(57, "$t7", "$t0" , "$t1" , "$t2", "$t3", 
"$t4", "$t5", "$t6"), + roundb!(58, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(59, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(60, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(61, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(62, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(63, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + + // Update state registers + "ld.w $a4, $a0, 0", // a + "ld.w $a5, $a0, 4", // b + "ld.w $a6, $a0, 8", // c + "ld.w $a7, $a0, 12", // d + "add.w $t0, $t0, $a4", + "add.w $t1, $t1, $a5", + "add.w $t2, $t2, $a6", + "add.w $t3, $t3, $a7", + "ld.w $a4, $a0, 16", // e + "ld.w $a5, $a0, 20", // f + "ld.w $a6, $a0, 24", // g + "ld.w $a7, $a0, 28", // h + "add.w $t4, $t4, $a4", + "add.w $t5, $t5, $a5", + "add.w $t6, $t6, $a6", + "add.w $t7, $t7, $a7", + + // Save updated state + "st.w $t0, $a0, 0", + "st.w $t1, $a0, 4", + "st.w $t2, $a0, 8", + "st.w $t3, $a0, 12", + "st.w $t4, $a0, 16", + "st.w $t5, $a0, 20", + "st.w $t6, $a0, 24", + "st.w $t7, $a0, 28", + + // Looping over blocks + "addi.d $a1, $a1, 64", + "addi.d $a2, $a2, -1", + "bnez $a2, 42b", + + // Restore stack register + "addi.d $sp, $sp, 64", + + in("$a0") state, + inout("$a1") blocks.as_ptr() => _, + inout("$a2") blocks.len() => _, + in("$a3") crate::consts::K32.as_ptr(), + + // Clobbers + out("$a4") _, + out("$a5") _, + out("$a6") _, + out("$a7") _, + out("$t0") _, + out("$t1") _, + out("$t2") _, + out("$t3") _, + out("$t4") _, + out("$t5") _, + out("$t6") _, + out("$t7") _, + + options(preserves_flags), + ); + } +} diff --git a/sha2/src/sha512.rs b/sha2/src/sha512.rs index af4178c0b..dfe0b454f 100644 --- a/sha2/src/sha512.rs +++ b/sha2/src/sha512.rs @@ -19,6 +19,9 @@ cfg_if::cfg_if! { mod soft; mod aarch64; use aarch64::compress; + } else if #[cfg(all(feature = "loongarch64_asm", target_arch = "loongarch64"))] { + mod loongarch64_asm; + use loongarch64_asm::compress; } else { mod soft; use soft::compress; diff --git a/sha2/src/sha512/loongarch64_asm.rs b/sha2/src/sha512/loongarch64_asm.rs new file mode 100644 index 000000000..557089def --- /dev/null +++ b/sha2/src/sha512/loongarch64_asm.rs @@ -0,0 +1,242 @@ +//! LoongArch64 assembly backend + +macro_rules! c { + ($($l:expr)*) => { + concat!($($l ,)*) + }; +} + +macro_rules! rounda { + ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => { + c!( + "ld.d $a5, $a1, (" $i " * 8);" + "revb.d $a5, $a5;" + roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h) + ) + }; +} + +macro_rules! roundb { + ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => { + c!( + "ld.d $a4, $sp, (((" $i " - 15) & 0xF) * 8);" + "ld.d $a5, $sp, (((" $i " - 16) & 0xF) * 8);" + "ld.d $a6, $sp, (((" $i " - 7) & 0xF) * 8);" + "add.d $a5, $a5, $a6;" + "rotri.d $a6, $a4, 8;" + "srli.d $a7, $a4, 7;" + "rotri.d $a4, $a4, 1;" + "xor $a6, $a6, $a7;" + "xor $a4, $a4, $a6;" + "add.d $a5, $a5, $a4;" + "ld.d $a4, $sp, (((" $i " - 2) & 0xF) * 8);" + "rotri.d $a6, $a4, 61;" + "srli.d $a7, $a4, 6;" + "rotri.d $a4, $a4, 19;" + "xor $a6, $a6, $a7;" + "xor $a4, $a4, $a6;" + "add.d $a5, $a5, $a4;" + roundtail!($i, $a, $b, $c, $d, $e, $f, $g, $h) + ) + }; +} + +macro_rules! 
+macro_rules! roundtail {
+    ($i:literal, $a:literal, $b:literal, $c:literal, $d:literal, $e:literal, $f:literal, $g:literal, $h:literal) => {
+        c!(
+            // Part 0
+            "rotri.d $a6, " $e ", 18;"
+            "rotri.d $a7, " $e ", 41;"
+            "rotri.d $a4, " $e ", 14;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "xor $a6, " $g ", " $f ";"
+            "ld.d $a7, $a3, " $i " * 8;"
+            "and $a6, $a6, " $e ";"
+            "xor $a6, $a6, " $g ";"
+            "add.d $a4, $a4, $a6;"
+            "add.d $a4, $a4, $a7;"
+            "add.d " $h ", " $h ", $a5;"
+            "add.d " $h ", " $h ", $a4;"
+            // Part 1
+            "add.d " $d ", " $d ", " $h ";"
+            // Part 2
+            "rotri.d $a6, " $a ", 39;"
+            "rotri.d $a7, " $a ", 34;"
+            "rotri.d $a4, " $a ", 28;"
+            "xor $a6, $a6, $a7;"
+            "xor $a4, $a4, $a6;"
+            "add.d " $h ", " $h ", $a4;"
+            "or $a4, " $c ", " $b ";"
+            "and $a6, " $c ", " $b ";"
+            "and $a4, $a4, " $a ";"
+            "or $a4, $a4, $a6;"
+            "add.d " $h ", " $h ", $a4;"
+            "st.d $a5, $sp, ((" $i " & 0xF) * 8);"
+        )
+    };
+}
+
+pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    if blocks.is_empty() {
+        return;
+    }
+
+    unsafe {
+        core::arch::asm!(
+            // Allocate scratch stack space
+            "addi.d $sp, $sp, -128;",
+
+            // Load state
+            "ld.d $t0, $a0, 0",
+            "ld.d $t1, $a0, 8",
+            "ld.d $t2, $a0, 16",
+            "ld.d $t3, $a0, 24",
+            "ld.d $t4, $a0, 32",
+            "ld.d $t5, $a0, 40",
+            "ld.d $t6, $a0, 48",
+            "ld.d $t7, $a0, 56",
+
+            "42:",
+
+            // Do 80 rounds of hashing
+            rounda!( 0, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            rounda!( 1, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            rounda!( 2, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            rounda!( 3, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            rounda!( 4, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            rounda!( 5, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            rounda!( 6, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            rounda!( 7, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            rounda!( 8, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            rounda!( 9, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            rounda!(10, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            rounda!(11, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            rounda!(12, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            rounda!(13, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            rounda!(14, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            rounda!(15, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(16, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(17, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(18, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(19, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(20, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(21, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(22, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
+            roundb!(23, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+            roundb!(24, "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"),
+            roundb!(25, "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6"),
+            roundb!(26, "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4", "$t5"),
+            roundb!(27, "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3", "$t4"),
+            roundb!(28, "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2", "$t3"),
+            roundb!(29, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1", "$t2"),
+            roundb!(30, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0", "$t1"),
"$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(31, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(32, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(33, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(34, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(35, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(36, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(37, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(38, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(39, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(40, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(41, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(42, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(43, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(44, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(45, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(46, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(47, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(48, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(49, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(50, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(51, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(52, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(53, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(54, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(55, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(56, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(57, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(58, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(59, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(60, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(61, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(62, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(63, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(64, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(65, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(66, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(67, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(68, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(69, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(70, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + roundb!(71, "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"), + roundb!(72, "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6", "$t7"), + roundb!(73, "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5", "$t6"), + roundb!(74, "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4", "$t5"), + roundb!(75, "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3", "$t4"), + roundb!(76, "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2", "$t3"), + roundb!(77, "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1" , "$t2"), + roundb!(78, "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0" , "$t1"), + 
+            roundb!(79, "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t0"),
+
+            // Update state registers
+            "ld.d $a4, $a0, 0",  // a
+            "ld.d $a5, $a0, 8",  // b
+            "ld.d $a6, $a0, 16", // c
+            "ld.d $a7, $a0, 24", // d
+            "add.d $t0, $t0, $a4",
+            "add.d $t1, $t1, $a5",
+            "add.d $t2, $t2, $a6",
+            "add.d $t3, $t3, $a7",
+            "ld.d $a4, $a0, 32", // e
+            "ld.d $a5, $a0, 40", // f
+            "ld.d $a6, $a0, 48", // g
+            "ld.d $a7, $a0, 56", // h
+            "add.d $t4, $t4, $a4",
+            "add.d $t5, $t5, $a5",
+            "add.d $t6, $t6, $a6",
+            "add.d $t7, $t7, $a7",
+
+            // Save updated state
+            "st.d $t0, $a0, 0",
+            "st.d $t1, $a0, 8",
+            "st.d $t2, $a0, 16",
+            "st.d $t3, $a0, 24",
+            "st.d $t4, $a0, 32",
+            "st.d $t5, $a0, 40",
+            "st.d $t6, $a0, 48",
+            "st.d $t7, $a0, 56",
+
+            // Looping over blocks
+            "addi.d $a1, $a1, 128",
+            "addi.d $a2, $a2, -1",
+            "bnez $a2, 42b",
+
+            // Restore stack register
+            "addi.d $sp, $sp, 128",
+
+            in("$a0") state,
+            inout("$a1") blocks.as_ptr() => _,
+            inout("$a2") blocks.len() => _,
+            in("$a3") crate::consts::K64.as_ptr(),
+
+            // Clobbers
+            out("$a4") _,
+            out("$a5") _,
+            out("$a6") _,
+            out("$a7") _,
+            out("$t0") _,
+            out("$t1") _,
+            out("$t2") _,
+            out("$t3") _,
+            out("$t4") _,
+            out("$t5") _,
+            out("$t6") _,
+            out("$t7") _,
+
+            options(preserves_flags),
+        );
+    }
+}
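
Note for reviewers (not part of the patch itself): both backends above encode the standard SHA-2 round function directly in LoongArch64 assembly, with the eight working variables held in `$t0`-`$t7`, the message schedule kept in the 64- or 128-byte scratch area reserved on the stack, and the round-constant table passed in via `$a3` (`K32`/`K64`). As a cross-check, here is a rough portable Rust sketch of what one SHA-256 round (one `roundtail!` expansion, i.e. the `Part 0/1/2` steps) computes; the function and variable names are illustrative only and nothing below ships in the crate. The SHA-512 file differs only in word size, rotation/shift amounts, and its 80 rounds. The backend is gated behind the `loongarch64_asm` feature and exercised in CI with `cargo build --target loongarch64-unknown-linux-gnu --features loongarch64_asm`.

    // Rough portable sketch of one SHA-256 round as encoded by `roundtail!`.
    // Illustrative only: these names are made up for this note, not crate API.
    fn sha256_round_sketch(state: &mut [u32; 8], k: u32, w: u32) {
        let [a, b, c, d, e, f, g, h] = *state;
        // Part 0: t1 = h + Sigma1(e) + Ch(e, f, g) + K[i] + W[i]
        let s1 = e.rotate_right(6) ^ e.rotate_right(11) ^ e.rotate_right(25);
        let ch = ((f ^ g) & e) ^ g; // same formulation the asm uses
        let t1 = h
            .wrapping_add(s1)
            .wrapping_add(ch)
            .wrapping_add(k)
            .wrapping_add(w);
        // Part 1: d += t1 (this value becomes the next round's e)
        let d = d.wrapping_add(t1);
        // Part 2: t1 += Sigma0(a) + Maj(a, b, c) (becomes the next round's a)
        let s0 = a.rotate_right(2) ^ a.rotate_right(13) ^ a.rotate_right(22);
        let maj = ((b | c) & a) | (b & c); // same formulation the asm uses
        let new_a = t1.wrapping_add(s0).wrapping_add(maj);
        // The asm rotates which register plays which role between rounds
        // instead of moving data; expressed as data movement, the update is:
        *state = [new_a, a, b, c, d, e, f, g];
    }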