diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index bf54c2f1d3c..ac2d952b965 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt @@ -193,12 +193,12 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR # Set the source directory for s2n-bignum assembly files if(ARCH STREQUAL "x86_64") - set(S2N_BIGNUM_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/x86_att) + set(S2N_BIGNUM_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/s2n-bignum-imported/x86_att) else() - set(S2N_BIGNUM_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/arm) + set(S2N_BIGNUM_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/s2n-bignum-imported/arm) endif() - set(S2N_BIGNUM_INCLUDE_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/include) + set(S2N_BIGNUM_INCLUDE_DIR ${AWSLC_SOURCE_DIR}/third_party/s2n-bignum/s2n-bignum-imported/include) # We add s2n-bignum files to a separate list because they need # to go through C preprocessor in case of the static build. @@ -290,16 +290,10 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR generic/bignum_optsub.S generic/bignum_sqr.S - fastmul/bignum_kmul_16_32_neon.S - fastmul/bignum_kmul_32_64_neon.S - fastmul/bignum_ksqr_16_32_neon.S - fastmul/bignum_ksqr_32_64_neon.S - fastmul/bignum_emontredc_8n_neon.S - generic/bignum_copy_row_from_table.S - generic/bignum_copy_row_from_table_8n_neon.S - generic/bignum_copy_row_from_table_16_neon.S - generic/bignum_copy_row_from_table_32_neon.S + generic/bignum_copy_row_from_table_8n.S + generic/bignum_copy_row_from_table_16.S + generic/bignum_copy_row_from_table_32.S ) endif() endif() diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c index a35658223b1..8713715b037 100644 --- a/crypto/fipsmodule/bn/exponentiation.c +++ b/crypto/fipsmodule/bn/exponentiation.c @@ -124,11 +124,13 @@ defined(OPENSSL_OPENBSD) || defined(OPENSSL_FREEBSD)) && \ defined(OPENSSL_AARCH64) -#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #define BN_EXPONENTIATION_S2N_BIGNUM_CAPABLE 1 -OPENSSL_INLINE int exponentiation_use_s2n_bignum(void) { return 1; } +OPENSSL_INLINE int exponentiation_use_s2n_bignum(void) { + return CRYPTO_is_NEON_capable(); +} #else @@ -143,17 +145,12 @@ static void exponentiation_s2n_bignum_copy_from_prebuf(BN_ULONG *dest, int width #if defined(BN_EXPONENTIATION_S2N_BIGNUM_CAPABLE) int table_height = 1 << window; - if (CRYPTO_is_NEON_capable()) { - if (width == 32) { - bignum_copy_row_from_table_32_neon(dest, table, table_height, rowidx); - } else if (width == 16) { - bignum_copy_row_from_table_16_neon(dest, table, table_height, rowidx); - } else if (width % 8 == 0) { - bignum_copy_row_from_table_8n_neon(dest, table, table_height, width, - rowidx); - } else { - bignum_copy_row_from_table(dest, table, table_height, width, rowidx); - } + if (width == 32) { + bignum_copy_row_from_table_32(dest, table, table_height, rowidx); + } else if (width == 16) { + bignum_copy_row_from_table_16(dest, table, table_height, rowidx); + } else if (width % 8 == 0) { + bignum_copy_row_from_table_8n(dest, table, table_height, width, rowidx); } else { bignum_copy_row_from_table(dest, table, table_height, width, rowidx); } diff --git a/crypto/fipsmodule/bn/montgomery.c b/crypto/fipsmodule/bn/montgomery.c index 38a651b9bbf..c7ac15c18d6 100644 --- a/crypto/fipsmodule/bn/montgomery.c +++ b/crypto/fipsmodule/bn/montgomery.c @@ -127,7 +127,7 @@ 
defined(OPENSSL_OPENBSD) || defined(OPENSSL_FREEBSD)) && \ defined(OPENSSL_AARCH64) && defined(OPENSSL_BN_ASM_MONT) -#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #define BN_MONTGOMERY_S2N_BIGNUM_CAPABLE 1 @@ -137,11 +137,14 @@ OPENSSL_INLINE int montgomery_use_s2n_bignum(unsigned int num) { // (2) num (which is the number of words) is multiplie of 8, because // s2n-bignum's bignum_emontredc_8n requires it, and // (3) The word size is 64 bits. + // (4) CPU has NEON. assert(S2NBIGNUM_KSQR_16_32_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS && S2NBIGNUM_KSQR_32_64_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS && S2NBIGNUM_KMUL_16_32_TEMP_NWORDS <= S2NBIGNUM_KMUL_32_64_TEMP_NWORDS); assert(BN_BITS2 == 64); - return !CRYPTO_is_ARMv8_wide_multiplier_capable() && (num % 8 == 0); + return !CRYPTO_is_ARMv8_wide_multiplier_capable() && + (num % 8 == 0) && + CRYPTO_is_NEON_capable(); } #else @@ -454,7 +457,7 @@ static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a, // are equivalent to the arguments of bn_mul_mont. // montgomery_s2n_bignum_mul_mont works only if num is a multiple of 8. // montgomery_use_s2n_bignum(num) must be called in advance to check this -// condition. +// condition, as well as other s2n-bignum requirements. // For num = 32 or num = 16, this uses faster primitives in s2n-bignum. // montgomery_s2n_bignum_mul_mont allocates S2NBIGNUM_KMUL_32_64_TEMP_NWORDS + // 2 * BN_MONTGOMERY_MAX_WORDS uint64_t words at the stack. @@ -477,34 +480,23 @@ static void montgomery_s2n_bignum_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, uint64_t w = n0[0]; if (num == 32) { - if (CRYPTO_is_NEON_capable()) { - if (ap == bp) - bignum_ksqr_32_64_neon(mulres, ap, t); - else - bignum_kmul_32_64_neon(mulres, ap, bp, t); + if (ap == bp) { + bignum_ksqr_32_64(mulres, ap, t); } else { - if (ap == bp) - bignum_ksqr_32_64(mulres, ap, t); - else - bignum_kmul_32_64(mulres, ap, bp, t); + bignum_kmul_32_64(mulres, ap, bp, t); } } else if (num == 16) { - if (CRYPTO_is_NEON_capable()) { - if (ap == bp) - bignum_ksqr_16_32_neon(mulres, ap, t); - else - bignum_kmul_16_32_neon(mulres, ap, bp, t); + if (ap == bp) { + bignum_ksqr_16_32(mulres, ap, t); } else { - if (ap == bp) - bignum_ksqr_16_32(mulres, ap, t); - else - bignum_kmul_16_32(mulres, ap, bp, t); + bignum_kmul_16_32(mulres, ap, bp, t); } } else { - if (ap == bp) + if (ap == bp) { bignum_sqr(num * 2, mulres, num, ap); - else + } else { bignum_mul(num * 2, mulres, num, ap, num, bp); + } } // Do montgomery reduction. We follow the definition of montgomery reduction @@ -518,9 +510,7 @@ static void montgomery_s2n_bignum_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, // A. The result of step 1 >= 2^(64*num), meaning that bignum_emontredc_8n // returned 1. Since m is less than 2^(64*num), (result of step 1) >= m holds. // B. The result of step 1 fits in 2^(64*num), and the result >= m. - uint64_t c = CRYPTO_is_NEON_capable() ? 
- bignum_emontredc_8n_neon(num, mulres, np, w) : - bignum_emontredc_8n(num, mulres, np, w); // c: case A + uint64_t c = bignum_emontredc_8n(num, mulres, np, w); // c: case A c |= bignum_ge(num, mulres + num, num, np); // c: case B // Optionally subtract and store the result at rp bignum_optsub(num, rp, mulres + num, c, np); diff --git a/crypto/fipsmodule/curve25519/curve25519_s2n_bignum_asm.c b/crypto/fipsmodule/curve25519/curve25519_s2n_bignum_asm.c index af225f6784f..64a1ed5e703 100644 --- a/crypto/fipsmodule/curve25519/curve25519_s2n_bignum_asm.c +++ b/crypto/fipsmodule/curve25519/curve25519_s2n_bignum_asm.c @@ -5,7 +5,7 @@ #include "../cpucap/internal.h" #if defined(CURVE25519_S2N_BIGNUM_CAPABLE) -#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" void x25519_scalar_mult_generic_s2n_bignum( uint8_t out_shared_key[X25519_SHARED_KEY_LEN], diff --git a/crypto/fipsmodule/ec/p256-nistz.c b/crypto/fipsmodule/ec/p256-nistz.c index 3e8afd6aadb..5b2431f0c2a 100644 --- a/crypto/fipsmodule/ec/p256-nistz.c +++ b/crypto/fipsmodule/ec/p256-nistz.c @@ -34,7 +34,7 @@ #include "ec_nistp.h" #if defined(EC_NISTP_USE_S2N_BIGNUM) -#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +#include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #endif #if !defined(OPENSSL_NO_ASM) && \ diff --git a/crypto/fipsmodule/ec/p384.c b/crypto/fipsmodule/ec/p384.c index ba5d780a797..581969f34a7 100644 --- a/crypto/fipsmodule/ec/p384.c +++ b/crypto/fipsmodule/ec/p384.c @@ -19,7 +19,7 @@ #if !defined(OPENSSL_SMALL) #if defined(EC_NISTP_USE_S2N_BIGNUM) -# include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +# include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #else # if defined(EC_NISTP_USE_64BIT_LIMB) # include "../../../third_party/fiat/p384_64.h" diff --git a/crypto/fipsmodule/ec/p521.c b/crypto/fipsmodule/ec/p521.c index db45e518503..ff27c8f7a8b 100644 --- a/crypto/fipsmodule/ec/p521.c +++ b/crypto/fipsmodule/ec/p521.c @@ -22,7 +22,7 @@ #if !defined(OPENSSL_SMALL) #if defined(EC_NISTP_USE_S2N_BIGNUM) -# include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +# include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #else # if defined(EC_NISTP_USE_64BIT_LIMB) # include "../../../third_party/fiat/p521_64.h" diff --git a/third_party/s2n-bignum/META.yml b/third_party/s2n-bignum/META.yml new file mode 100644 index 00000000000..4949b2bf609 --- /dev/null +++ b/third_party/s2n-bignum/META.yml @@ -0,0 +1,5 @@ +name: s2n-bignum-imported +source: awslabs/s2n-bignum.git +commit: 54e1fa5756d6b13961c2f61d90f75426aa25d373 +target: main +imported-at: 2025-04-28T17:22:07+0000 diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32.S b/third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32.S deleted file mode 100644 index e45dd487e1f..00000000000 --- a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32.S +++ /dev/null @@ -1,798 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Multiply z := x * y -// Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] -// -// extern void bignum_kmul_16_32 -// (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], -// uint64_t t[static 32]) -// -// This is a Karatsuba-style function multiplying half-sized results -// internally and using temporary buffer t for intermediate results. -// -// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = t -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_16_32) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_16_32) - .text - .balign 4 - -// Subroutine-safe copies of the output, inputs and temporary buffer pointers - -#define z x25 -#define x x26 -#define y x27 -#define t x28 - -// More variables for sign masks, with s also necessarily subroutine-safe - -#define s x29 -#define m x19 - -S2N_BN_SYMBOL(bignum_kmul_16_32): - -// Save registers, including return address - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! - stp x27, x28, [sp, #-16]! - stp x29, x30, [sp, #-16]! - -// Move parameters into subroutine-safe places - - mov z, x0 - mov x, x1 - mov y, x2 - mov t, x3 - -// Compute L = x_lo * y_lo in bottom half of buffer (size 8 x 8 -> 16) - - bl bignum_kmul_16_32_local_mul_8_16 - -// Compute absolute difference [t..] = |x_lo - x_hi| -// and the sign s = sgn(x_lo - x_hi) as a bitmask (all 1s for negative) - - ldp x10, x11, [x] - ldp x8, x9, [x, #64] - subs x10, x10, x8 - sbcs x11, x11, x9 - ldp x12, x13, [x, #16] - ldp x8, x9, [x, #80] - sbcs x12, x12, x8 - sbcs x13, x13, x9 - ldp x14, x15, [x, #32] - ldp x8, x9, [x, #96] - sbcs x14, x14, x8 - sbcs x15, x15, x9 - ldp x16, x17, [x, #48] - ldp x8, x9, [x, #112] - sbcs x16, x16, x8 - sbcs x17, x17, x9 - csetm s, cc - adds xzr, s, s - eor x10, x10, s - adcs x10, x10, xzr - eor x11, x11, s - adcs x11, x11, xzr - stp x10, x11, [t] - eor x12, x12, s - adcs x12, x12, xzr - eor x13, x13, s - adcs x13, x13, xzr - stp x12, x13, [t, #16] - eor x14, x14, s - adcs x14, x14, xzr - eor x15, x15, s - adcs x15, x15, xzr - stp x14, x15, [t, #32] - eor x16, x16, s - adcs x16, x16, xzr - eor x17, x17, s - adcs x17, x17, xzr - stp x16, x17, [t, #48] - -// Compute H = x_hi * y_hi in top half of buffer (size 8 x 8 -> 16) - - add x0, z, #128 - add x1, x, #64 - add x2, y, #64 - bl bignum_kmul_16_32_local_mul_8_16 - -// Compute the other absolute difference [t+8..] 
= |y_hi - y_lo| -// Collect the combined product sign bitmask (all 1s for negative) in s - - ldp x10, x11, [y] - ldp x8, x9, [y, #64] - subs x10, x8, x10 - sbcs x11, x9, x11 - ldp x12, x13, [y, #16] - ldp x8, x9, [y, #80] - sbcs x12, x8, x12 - sbcs x13, x9, x13 - ldp x14, x15, [y, #32] - ldp x8, x9, [y, #96] - sbcs x14, x8, x14 - sbcs x15, x9, x15 - ldp x16, x17, [y, #48] - ldp x8, x9, [y, #112] - sbcs x16, x8, x16 - sbcs x17, x9, x17 - csetm m, cc - adds xzr, m, m - eor x10, x10, m - adcs x10, x10, xzr - eor x11, x11, m - adcs x11, x11, xzr - stp x10, x11, [t, #64] - eor x12, x12, m - adcs x12, x12, xzr - eor x13, x13, m - adcs x13, x13, xzr - stp x12, x13, [t, #80] - eor x14, x14, m - adcs x14, x14, xzr - eor x15, x15, m - adcs x15, x15, xzr - stp x14, x15, [t, #96] - eor x16, x16, m - adcs x16, x16, xzr - eor x17, x17, m - adcs x17, x17, xzr - stp x16, x17, [t, #112] - eor s, s, m - -// Compute H' = H + L_top in place of H (it cannot overflow) -// First add 8-sized block then propagate carry through next 8 - - ldp x10, x11, [z, #128] - ldp x12, x13, [z, #64] - adds x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128] - - ldp x10, x11, [z, #128+16] - ldp x12, x13, [z, #64+16] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+16] - - ldp x10, x11, [z, #128+32] - ldp x12, x13, [z, #64+32] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+32] - - ldp x10, x11, [z, #128+48] - ldp x12, x13, [z, #64+48] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+48] - - ldp x10, x11, [z, #128+64] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+64] - - ldp x10, x11, [z, #128+80] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+80] - - ldp x10, x11, [z, #128+96] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+96] - - ldp x10, x11, [z, #128+112] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+112] - -// Compute M = |x_lo - x_hi| * |y_hi - y_lo| in [t+16...], size 16 - - add x0, t, #128 - mov x1, t - add x2, t, #64 - bl bignum_kmul_16_32_local_mul_8_16 - -// Add the interlocking H' and L_bot terms, storing in registers x15..x0 -// Intercept the carry at the 8 + 16 = 24 position and store it in x. -// (Note that we no longer need the input x was pointing at.) 
- - ldp x0, x1, [z] - ldp x16, x17, [z, #128] - adds x0, x0, x16 - adcs x1, x1, x17 - ldp x2, x3, [z, #16] - ldp x16, x17, [z, #144] - adcs x2, x2, x16 - adcs x3, x3, x17 - ldp x4, x5, [z, #32] - ldp x16, x17, [z, #160] - adcs x4, x4, x16 - adcs x5, x5, x17 - ldp x6, x7, [z, #48] - ldp x16, x17, [z, #176] - adcs x6, x6, x16 - adcs x7, x7, x17 - ldp x8, x9, [z, #128] - ldp x16, x17, [z, #192] - adcs x8, x8, x16 - adcs x9, x9, x17 - ldp x10, x11, [z, #144] - ldp x16, x17, [z, #208] - adcs x10, x10, x16 - adcs x11, x11, x17 - ldp x12, x13, [z, #160] - ldp x16, x17, [z, #224] - adcs x12, x12, x16 - adcs x13, x13, x17 - ldp x14, x15, [z, #176] - ldp x16, x17, [z, #240] - adcs x14, x14, x16 - adcs x15, x15, x17 - - cset x, cs - -// Add the sign-adjusted mid-term cross product M - - cmn s, s - - ldp x16, x17, [t, #128] - eor x16, x16, s - adcs x0, x0, x16 - eor x17, x17, s - adcs x1, x1, x17 - stp x0, x1, [z, #64] - ldp x16, x17, [t, #144] - eor x16, x16, s - adcs x2, x2, x16 - eor x17, x17, s - adcs x3, x3, x17 - stp x2, x3, [z, #80] - ldp x16, x17, [t, #160] - eor x16, x16, s - adcs x4, x4, x16 - eor x17, x17, s - adcs x5, x5, x17 - stp x4, x5, [z, #96] - ldp x16, x17, [t, #176] - eor x16, x16, s - adcs x6, x6, x16 - eor x17, x17, s - adcs x7, x7, x17 - stp x6, x7, [z, #112] - ldp x16, x17, [t, #192] - eor x16, x16, s - adcs x8, x8, x16 - eor x17, x17, s - adcs x9, x9, x17 - stp x8, x9, [z, #128] - ldp x16, x17, [t, #208] - eor x16, x16, s - adcs x10, x10, x16 - eor x17, x17, s - adcs x11, x11, x17 - stp x10, x11, [z, #144] - ldp x16, x17, [t, #224] - eor x16, x16, s - adcs x12, x12, x16 - eor x17, x17, s - adcs x13, x13, x17 - stp x12, x13, [z, #160] - ldp x16, x17, [t, #240] - eor x16, x16, s - adcs x14, x14, x16 - eor x17, x17, s - adcs x15, x15, x17 - stp x14, x15, [z, #176] - -// Get the next digits effectively resulting so far starting at 24 - - adcs y, s, x - adc t, s, xzr - -// Now the final 8 digits of padding; the first one is special in using y -// and also in getting the carry chain started - - ldp x10, x11, [z, #192] - adds x10, x10, y - adcs x11, x11, t - stp x10, x11, [z, #192] - ldp x10, x11, [z, #208] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #208] - ldp x10, x11, [z, #224] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #224] - ldp x10, x11, [z, #240] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #240] - -// Restore registers and return - - ldp x29, x30, [sp], #16 - ldp x27, x28, [sp], #16 - ldp x25, x26, [sp], #16 - ldp x23, x24, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - - ret - -// ----------------------------------------------------------------------- -// Local copy of bignum_mul_8_16 without the scratch register save/restore -// ----------------------------------------------------------------------- - -bignum_kmul_16_32_local_mul_8_16: - ldp x3, x4, [x1] - ldp x7, x8, [x2] - ldp x5, x6, [x1, #16] - ldp x9, x10, [x2, #16] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - 
cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x3, x4, [x1, #32] - stp x11, x12, [x0] - ldp x7, x8, [x2, #32] - stp x13, x14, [x0, #16] - ldp x5, x6, [x1, #48] - stp x15, x16, [x0, #32] - ldp x9, x10, [x2, #48] - stp x17, x19, [x0, #48] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - ldp x22, x21, [x0, #32] - adds x11, x11, x22 - adcs x12, x12, x21 - ldp x22, x21, [x0, #48] - adcs x13, x13, x22 - adcs x14, x14, x21 - adcs x15, x15, xzr - adcs x16, x16, xzr - adcs x17, x17, xzr - adc x19, x19, xzr - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - 
adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x22, x21, [x1] - subs x3, x3, x22 - sbcs x4, x4, x21 - ldp x22, x21, [x1, #16] - sbcs x5, x5, x22 - sbcs x6, x6, x21 - csetm x24, cc - stp x11, x12, [x0, #64] - ldp x22, x21, [x2] - subs x7, x22, x7 - sbcs x8, x21, x8 - ldp x22, x21, [x2, #16] - sbcs x9, x22, x9 - sbcs x10, x21, x10 - csetm x1, cc - stp x13, x14, [x0, #80] - eor x3, x3, x24 - subs x3, x3, x24 - eor x4, x4, x24 - sbcs x4, x4, x24 - eor x5, x5, x24 - sbcs x5, x5, x24 - eor x6, x6, x24 - sbc x6, x6, x24 - stp x15, x16, [x0, #96] - eor x7, x7, x1 - subs x7, x7, x1 - eor x8, x8, x1 - sbcs x8, x8, x1 - eor x9, x9, x1 - sbcs x9, x9, x1 - eor x10, x10, x1 - sbc x10, x10, x1 - stp x17, x19, [x0, #112] - eor x1, x1, x24 - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - 
cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x3, x4, [x0] - ldp x7, x8, [x0, #64] - adds x3, x3, x7 - adcs x4, x4, x8 - ldp x5, x6, [x0, #16] - ldp x9, x10, [x0, #80] - adcs x5, x5, x9 - adcs x6, x6, x10 - ldp x20, x21, [x0, #96] - adcs x7, x7, x20 - adcs x8, x8, x21 - ldp x22, x23, [x0, #112] - adcs x9, x9, x22 - adcs x10, x10, x23 - adcs x24, x1, xzr - adc x2, x1, xzr - cmn x1, #0x1 - eor x11, x11, x1 - adcs x3, x11, x3 - eor x12, x12, x1 - adcs x4, x12, x4 - eor x13, x13, x1 - adcs x5, x13, x5 - eor x14, x14, x1 - adcs x6, x14, x6 - eor x15, x15, x1 - adcs x7, x15, x7 - eor x16, x16, x1 - adcs x8, x16, x8 - eor x17, x17, x1 - adcs x9, x17, x9 - eor x19, x19, x1 - adcs x10, x19, x10 - adcs x20, x20, x24 - adcs x21, x21, x2 - adcs x22, x22, x2 - adc x23, x23, x2 - stp x3, x4, [x0, #32] - stp x5, x6, [x0, #48] - stp x7, x8, [x0, #64] - stp x9, x10, [x0, #80] - stp x20, x21, [x0, #96] - stp x22, x23, [x0, #112] - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64.S b/third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64.S deleted file mode 100644 index e45249462ac..00000000000 --- a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64.S +++ /dev/null @@ -1,1348 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Multiply z := x * y -// Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] -// -// extern void bignum_kmul_32_64 -// (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], -// uint64_t t[static 96]) -// -// This is a Karatsuba-style function multiplying half-sized results -// internally and using temporary buffer t for intermediate results. -// -// Standard ARM ABI: X0 = z, X1 = x, X2 = y, X3 = t -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_32_64) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_32_64) - .text - .balign 4 - -#define K 16 -#define L 8 // this is (K/2) - -#define z x19 -#define x x20 -#define y x21 -#define t x22 - -#define c x16 - -S2N_BN_SYMBOL(bignum_kmul_32_64): - -// Save extra registers and return address, store parameters safely - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! - stp x25, x26, [sp, -16]! - stp x27, x28, [sp, -16]! - stp x29, x30, [sp, -16]! - - mov z, x0 - mov x, x1 - mov y, x2 - mov t, x3 - -// Compute L = x_lo * y_lo in bottom half of buffer (size 16 x 16 -> 32) - - bl bignum_kmul_32_64_local_kmul_16_32 - -// Compute H = x_hi * y_hi in top half of buffer (size 16 x 16 -> 32) - - add x0, z, #16*K - add x1, x, #8*K - add x2, y, #8*K - mov x3, t - bl bignum_kmul_32_64_local_kmul_16_32 - -// Compute absolute difference [t..] 
= |x_lo - x_hi| -// and the sign x = sgn(x_lo - x_hi) as a bitmask (all 1s for negative) -// Note that we overwrite the pointer x itself with this sign, -// which is safe since we no longer need it. - - ldp x0, x1, [x, #128] - ldp x16, x17, [x] - subs x0, x0, x16 - sbcs x1, x1, x17 - - ldp x2, x3, [x, #144] - ldp x16, x17, [x, #16] - sbcs x2, x2, x16 - sbcs x3, x3, x17 - - ldp x4, x5, [x, #160] - ldp x16, x17, [x, #32] - sbcs x4, x4, x16 - sbcs x5, x5, x17 - - ldp x6, x7, [x, #176] - ldp x16, x17, [x, #48] - sbcs x6, x6, x16 - sbcs x7, x7, x17 - - ldp x8, x9, [x, #192] - ldp x16, x17, [x, #64] - sbcs x8, x8, x16 - sbcs x9, x9, x17 - - ldp x10, x11, [x, #208] - ldp x16, x17, [x, #80] - sbcs x10, x10, x16 - sbcs x11, x11, x17 - - ldp x12, x13, [x, #224] - ldp x16, x17, [x, #96] - sbcs x12, x12, x16 - sbcs x13, x13, x17 - - ldp x14, x15, [x, #240] - ldp x16, x17, [x, #112] - sbcs x14, x14, x16 - sbcs x15, x15, x17 - - sbc x, xzr, xzr - - adds xzr, x, x - - eor x0, x0, x - adcs x0, x0, xzr - eor x1, x1, x - adcs x1, x1, xzr - stp x0, x1, [t] - - eor x2, x2, x - adcs x2, x2, xzr - eor x3, x3, x - adcs x3, x3, xzr - stp x2, x3, [t, #16] - - eor x4, x4, x - adcs x4, x4, xzr - eor x5, x5, x - adcs x5, x5, xzr - stp x4, x5, [t, #32] - - eor x6, x6, x - adcs x6, x6, xzr - eor x7, x7, x - adcs x7, x7, xzr - stp x6, x7, [t, #48] - - eor x8, x8, x - adcs x8, x8, xzr - eor x9, x9, x - adcs x9, x9, xzr - stp x8, x9, [t, #64] - - eor x10, x10, x - adcs x10, x10, xzr - eor x11, x11, x - adcs x11, x11, xzr - stp x10, x11, [t, #80] - - eor x12, x12, x - adcs x12, x12, xzr - eor x13, x13, x - adcs x13, x13, xzr - stp x12, x13, [t, #96] - - eor x14, x14, x - adcs x14, x14, xzr - eor x15, x15, x - adc x15, x15, xzr - stp x14, x15, [t, #112] - -// Compute the other absolute difference [t+8*K..] = |y_hi - y_lo| -// Collect the combined product sign bitmask (all 1s for negative) as -// y = sgn((x_lo - x_hi) * (y_hi - y_lo)), overwriting the y pointer. 
- - ldp x0, x1, [y] - ldp x16, x17, [y, #128] - subs x0, x0, x16 - sbcs x1, x1, x17 - - ldp x2, x3, [y, #16] - ldp x16, x17, [y, #144] - sbcs x2, x2, x16 - sbcs x3, x3, x17 - - ldp x4, x5, [y, #32] - ldp x16, x17, [y, #160] - sbcs x4, x4, x16 - sbcs x5, x5, x17 - - ldp x6, x7, [y, #48] - ldp x16, x17, [y, #176] - sbcs x6, x6, x16 - sbcs x7, x7, x17 - - ldp x8, x9, [y, #64] - ldp x16, x17, [y, #192] - sbcs x8, x8, x16 - sbcs x9, x9, x17 - - ldp x10, x11, [y, #80] - ldp x16, x17, [y, #208] - sbcs x10, x10, x16 - sbcs x11, x11, x17 - - ldp x12, x13, [y, #96] - ldp x16, x17, [y, #224] - sbcs x12, x12, x16 - sbcs x13, x13, x17 - - ldp x14, x15, [y, #112] - ldp x16, x17, [y, #240] - sbcs x14, x14, x16 - sbcs x15, x15, x17 - - sbc y, xzr, xzr - - adds xzr, y, y - - eor x0, x0, y - adcs x0, x0, xzr - eor x1, x1, y - adcs x1, x1, xzr - stp x0, x1, [t, #128] - - eor x2, x2, y - adcs x2, x2, xzr - eor x3, x3, y - adcs x3, x3, xzr - stp x2, x3, [t, #128+16] - - eor x4, x4, y - adcs x4, x4, xzr - eor x5, x5, y - adcs x5, x5, xzr - stp x4, x5, [t, #128+32] - - eor x6, x6, y - adcs x6, x6, xzr - eor x7, x7, y - adcs x7, x7, xzr - stp x6, x7, [t, #128+48] - - eor x8, x8, y - adcs x8, x8, xzr - eor x9, x9, y - adcs x9, x9, xzr - stp x8, x9, [t, #128+64] - - eor x10, x10, y - adcs x10, x10, xzr - eor x11, x11, y - adcs x11, x11, xzr - stp x10, x11, [t, #128+80] - - eor x12, x12, y - adcs x12, x12, xzr - eor x13, x13, y - adcs x13, x13, xzr - stp x12, x13, [t, #128+96] - - eor x14, x14, y - adcs x14, x14, xzr - eor x15, x15, y - adc x15, x15, xzr - stp x14, x15, [t, #128+112] - - eor y, y, x - -// Compute H' = H + L_top in place of H (it cannot overflow) - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z, #16*L] - adds x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*16] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*9] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*17] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*10] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*18] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*11] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*19] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*12] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*20] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*13] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*21] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*14] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*22] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*15] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*23] - - ldp x0, x1, [z, #16*24] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*24] - - ldp x0, x1, [z, #16*25] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*25] - - ldp x0, x1, [z, #16*26] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*26] - - ldp x0, x1, [z, #16*27] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*27] - - ldp x0, x1, [z, #16*28] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*28] - - ldp x0, x1, [z, #16*29] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*29] - - ldp x0, x1, [z, #16*30] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*30] - - ldp x0, x1, [z, #16*31] - adcs x0, x0, xzr - adc x1, x1, xzr - stp x0, x1, [z, #16*31] - -// Compute M = |x_lo - x_hi| * |y_hi - y_lo|, size 32 - - add x0, t, #16*K - mov x1, t - add x2, t, #8*K - add x3, t, #32*K - bl bignum_kmul_32_64_local_kmul_16_32 - -// Add the interlocking H' and L_bot terms -// Intercept 
the carry at the 3k position and store it in x. -// Again, we no longer need the input x was pointing at. - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z] - adds x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*8] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*1] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*9] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*2] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*10] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*3] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*11] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*4] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*12] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*5] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*13] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*6] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*14] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*7] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*15] - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z, #16*24] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*16] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*25] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*17] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*26] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*18] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*27] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*19] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*28] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*20] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*29] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*21] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*30] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*22] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*31] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*23] - - cset x, cs - -// Add the sign-adjusted mid-term cross product M - - cmn y, y - - ldp x0, x1, [z, #128] - ldp x2, x3, [t, #128+128] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #128] - - ldp x0, x1, [z, #144] - ldp x2, x3, [t, #128+144] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #144] - - ldp x0, x1, [z, #160] - ldp x2, x3, [t, #128+160] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #160] - - ldp x0, x1, [z, #176] - ldp x2, x3, [t, #128+176] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #176] - - ldp x0, x1, [z, #192] - ldp x2, x3, [t, #128+192] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #192] - - ldp x0, x1, [z, #208] - ldp x2, x3, [t, #128+208] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #208] - - ldp x0, x1, [z, #224] - ldp x2, x3, [t, #128+224] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #224] - - ldp x0, x1, [z, #240] - ldp x2, x3, [t, #128+240] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #240] - - ldp x0, x1, [z, #256] - ldp x2, x3, [t, #128+256] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #256] - - ldp x0, x1, [z, #272] - ldp x2, x3, [t, #128+272] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #272] - - ldp x0, x1, [z, #288] - ldp x2, x3, [t, #128+288] - eor x2, x2, 
y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #288] - - ldp x0, x1, [z, #304] - ldp x2, x3, [t, #128+304] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #304] - - ldp x0, x1, [z, #320] - ldp x2, x3, [t, #128+320] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #320] - - ldp x0, x1, [z, #336] - ldp x2, x3, [t, #128+336] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #336] - - ldp x0, x1, [z, #352] - ldp x2, x3, [t, #128+352] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #352] - - ldp x0, x1, [z, #368] - ldp x2, x3, [t, #128+368] - eor x2, x2, y - adcs x0, x0, x2 - eor x3, x3, y - adcs x1, x1, x3 - stp x0, x1, [z, #368] - -// Get the next digits effectively resulting so far starting at 3k -// [...,c,c,c,c,x] - - adcs x, y, x - adc c, y, xzr - -// Now propagate through the top quarter of the result - - ldp x0, x1, [z, #16*24] - adds x0, x0, x - adcs x1, x1, c - stp x0, x1, [z, #16*24] - - ldp x0, x1, [z, #16*25] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*25] - - ldp x0, x1, [z, #16*26] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*26] - - ldp x0, x1, [z, #16*27] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*27] - - ldp x0, x1, [z, #16*28] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*28] - - ldp x0, x1, [z, #16*29] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*29] - - ldp x0, x1, [z, #16*30] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*30] - - ldp x0, x1, [z, #16*31] - adcs x0, x0, c - adc x1, x1, c - stp x0, x1, [z, #16*31] - -// Restore and return - - ldp x29, x30, [sp], #16 - ldp x27, x28, [sp], #16 - ldp x25, x26, [sp], #16 - ldp x23, x24, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - ret - -// Local copy of bignum_kmul_16_32, identical to main one except that it -// only preserves the key registers we need to be stable in the main code. -// This includes in turn a copy of bignum_mul_8_16. - -bignum_kmul_32_64_local_kmul_16_32: - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x30, [sp, -16]! 
- mov x25, x0 - mov x26, x1 - mov x27, x2 - mov x28, x3 - bl bignum_kmul_32_64_local_mul_8_16 - ldp x10, x11, [x26] - ldp x8, x9, [x26, #64] - subs x10, x10, x8 - sbcs x11, x11, x9 - ldp x12, x13, [x26, #16] - ldp x8, x9, [x26, #80] - sbcs x12, x12, x8 - sbcs x13, x13, x9 - ldp x14, x15, [x26, #32] - ldp x8, x9, [x26, #96] - sbcs x14, x14, x8 - sbcs x15, x15, x9 - ldp x16, x17, [x26, #48] - ldp x8, x9, [x26, #112] - sbcs x16, x16, x8 - sbcs x17, x17, x9 - csetm x29, cc - cmn x29, x29 - eor x10, x10, x29 - adcs x10, x10, xzr - eor x11, x11, x29 - adcs x11, x11, xzr - stp x10, x11, [x28] - eor x12, x12, x29 - adcs x12, x12, xzr - eor x13, x13, x29 - adcs x13, x13, xzr - stp x12, x13, [x28, #16] - eor x14, x14, x29 - adcs x14, x14, xzr - eor x15, x15, x29 - adcs x15, x15, xzr - stp x14, x15, [x28, #32] - eor x16, x16, x29 - adcs x16, x16, xzr - eor x17, x17, x29 - adcs x17, x17, xzr - stp x16, x17, [x28, #48] - add x0, x25, #0x80 - add x1, x26, #0x40 - add x2, x27, #0x40 - bl bignum_kmul_32_64_local_mul_8_16 - ldp x10, x11, [x27] - ldp x8, x9, [x27, #64] - subs x10, x8, x10 - sbcs x11, x9, x11 - ldp x12, x13, [x27, #16] - ldp x8, x9, [x27, #80] - sbcs x12, x8, x12 - sbcs x13, x9, x13 - ldp x14, x15, [x27, #32] - ldp x8, x9, [x27, #96] - sbcs x14, x8, x14 - sbcs x15, x9, x15 - ldp x16, x17, [x27, #48] - ldp x8, x9, [x27, #112] - sbcs x16, x8, x16 - sbcs x17, x9, x17 - csetm x19, cc - cmn x19, x19 - eor x10, x10, x19 - adcs x10, x10, xzr - eor x11, x11, x19 - adcs x11, x11, xzr - stp x10, x11, [x28, #64] - eor x12, x12, x19 - adcs x12, x12, xzr - eor x13, x13, x19 - adcs x13, x13, xzr - stp x12, x13, [x28, #80] - eor x14, x14, x19 - adcs x14, x14, xzr - eor x15, x15, x19 - adcs x15, x15, xzr - stp x14, x15, [x28, #96] - eor x16, x16, x19 - adcs x16, x16, xzr - eor x17, x17, x19 - adcs x17, x17, xzr - stp x16, x17, [x28, #112] - eor x29, x29, x19 - ldp x10, x11, [x25, #128] - ldp x12, x13, [x25, #64] - adds x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x25, #128] - ldp x10, x11, [x25, #144] - ldp x12, x13, [x25, #80] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x25, #144] - ldp x10, x11, [x25, #160] - ldp x12, x13, [x25, #96] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x25, #160] - ldp x10, x11, [x25, #176] - ldp x12, x13, [x25, #112] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x25, #176] - ldp x10, x11, [x25, #192] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x25, #192] - ldp x10, x11, [x25, #208] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x25, #208] - ldp x10, x11, [x25, #224] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x25, #224] - ldp x10, x11, [x25, #240] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x25, #240] - add x0, x28, #0x80 - mov x1, x28 - add x2, x28, #0x40 - bl bignum_kmul_32_64_local_mul_8_16 - ldp x0, x1, [x25] - ldp x16, x17, [x25, #128] - adds x0, x0, x16 - adcs x1, x1, x17 - ldp x2, x3, [x25, #16] - ldp x16, x17, [x25, #144] - adcs x2, x2, x16 - adcs x3, x3, x17 - ldp x4, x5, [x25, #32] - ldp x16, x17, [x25, #160] - adcs x4, x4, x16 - adcs x5, x5, x17 - ldp x6, x7, [x25, #48] - ldp x16, x17, [x25, #176] - adcs x6, x6, x16 - adcs x7, x7, x17 - ldp x8, x9, [x25, #128] - ldp x16, x17, [x25, #192] - adcs x8, x8, x16 - adcs x9, x9, x17 - ldp x10, x11, [x25, #144] - ldp x16, x17, [x25, #208] - adcs x10, x10, x16 - adcs x11, x11, x17 - ldp x12, x13, [x25, #160] - ldp x16, x17, [x25, #224] - adcs x12, x12, x16 - adcs x13, x13, x17 - ldp x14, x15, [x25, #176] - ldp x16, x17, 
[x25, #240] - adcs x14, x14, x16 - adcs x15, x15, x17 - cset x26, cs - cmn x29, x29 - ldp x16, x17, [x28, #128] - eor x16, x16, x29 - adcs x0, x0, x16 - eor x17, x17, x29 - adcs x1, x1, x17 - stp x0, x1, [x25, #64] - ldp x16, x17, [x28, #144] - eor x16, x16, x29 - adcs x2, x2, x16 - eor x17, x17, x29 - adcs x3, x3, x17 - stp x2, x3, [x25, #80] - ldp x16, x17, [x28, #160] - eor x16, x16, x29 - adcs x4, x4, x16 - eor x17, x17, x29 - adcs x5, x5, x17 - stp x4, x5, [x25, #96] - ldp x16, x17, [x28, #176] - eor x16, x16, x29 - adcs x6, x6, x16 - eor x17, x17, x29 - adcs x7, x7, x17 - stp x6, x7, [x25, #112] - ldp x16, x17, [x28, #192] - eor x16, x16, x29 - adcs x8, x8, x16 - eor x17, x17, x29 - adcs x9, x9, x17 - stp x8, x9, [x25, #128] - ldp x16, x17, [x28, #208] - eor x16, x16, x29 - adcs x10, x10, x16 - eor x17, x17, x29 - adcs x11, x11, x17 - stp x10, x11, [x25, #144] - ldp x16, x17, [x28, #224] - eor x16, x16, x29 - adcs x12, x12, x16 - eor x17, x17, x29 - adcs x13, x13, x17 - stp x12, x13, [x25, #160] - ldp x16, x17, [x28, #240] - eor x16, x16, x29 - adcs x14, x14, x16 - eor x17, x17, x29 - adcs x15, x15, x17 - stp x14, x15, [x25, #176] - adcs x27, x29, x26 - adc x28, x29, xzr - ldp x10, x11, [x25, #192] - adds x10, x10, x27 - adcs x11, x11, x28 - stp x10, x11, [x25, #192] - ldp x10, x11, [x25, #208] - adcs x10, x10, x28 - adcs x11, x11, x28 - stp x10, x11, [x25, #208] - ldp x10, x11, [x25, #224] - adcs x10, x10, x28 - adcs x11, x11, x28 - stp x10, x11, [x25, #224] - ldp x10, x11, [x25, #240] - adcs x10, x10, x28 - adcs x11, x11, x28 - stp x10, x11, [x25, #240] - ldp x23, x30, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - ret - -bignum_kmul_32_64_local_mul_8_16: - ldp x3, x4, [x1] - ldp x7, x8, [x2] - ldp x5, x6, [x1, #16] - ldp x9, x10, [x2, #16] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg 
x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x3, x4, [x1, #32] - stp x11, x12, [x0] - ldp x7, x8, [x2, #32] - stp x13, x14, [x0, #16] - ldp x5, x6, [x1, #48] - stp x15, x16, [x0, #32] - ldp x9, x10, [x2, #48] - stp x17, x19, [x0, #48] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - ldp x22, x21, [x0, #32] - adds x11, x11, x22 - adcs x12, x12, x21 - ldp x22, x21, [x0, #48] - adcs x13, x13, x22 - adcs x14, x14, x21 - adcs x15, x15, xzr - adcs x16, x16, xzr - adcs x17, x17, xzr - adc x19, x19, xzr - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x22, x21, [x1] - subs x3, x3, x22 - sbcs x4, x4, x21 - ldp x22, x21, [x1, #16] - sbcs x5, x5, x22 - sbcs x6, x6, x21 - csetm x24, cc - stp x11, x12, [x0, #64] - 
ldp x22, x21, [x2] - subs x7, x22, x7 - sbcs x8, x21, x8 - ldp x22, x21, [x2, #16] - sbcs x9, x22, x9 - sbcs x10, x21, x10 - csetm x1, cc - stp x13, x14, [x0, #80] - eor x3, x3, x24 - subs x3, x3, x24 - eor x4, x4, x24 - sbcs x4, x4, x24 - eor x5, x5, x24 - sbcs x5, x5, x24 - eor x6, x6, x24 - sbc x6, x6, x24 - stp x15, x16, [x0, #96] - eor x7, x7, x1 - subs x7, x7, x1 - eor x8, x8, x1 - sbcs x8, x8, x1 - eor x9, x9, x1 - sbcs x9, x9, x1 - eor x10, x10, x1 - sbc x10, x10, x1 - stp x17, x19, [x0, #112] - eor x1, x1, x24 - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x9 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x16, x16, x22 - eor x21, x21, x20 - adcs x17, x17, x21 - adc x19, x19, x20 - subs x24, x3, x4 - cneg x24, x24, cc - csetm x20, cc - subs x21, x8, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x12, x12, x22 - eor x21, x21, x20 - adcs x13, x13, x21 - adcs x14, x14, x20 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x15, x15, x22 - eor x21, x21, x20 - adcs x16, x16, x21 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x13, x13, x22 - eor x21, x21, x20 - adcs x14, x14, x21 - adcs x15, x15, x20 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x3, x6 - cneg x24, x24, cc - csetm x20, cc - subs x21, x10, x7 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - subs x24, x4, x5 - cneg x24, x24, cc - csetm x20, cc - subs x21, x9, x8 - cneg x21, x21, cc - mul x22, x24, x21 - umulh x21, x24, x21 - cinv x20, x20, cc - cmn x20, #0x1 - eor x22, x22, x20 - adcs x14, x14, x22 - eor x21, x21, x20 - adcs x15, x15, x21 - adcs x16, x16, x20 - adcs x17, x17, x20 - adc x19, x19, x20 - ldp x3, x4, [x0] - ldp x7, x8, [x0, #64] - adds x3, x3, x7 - adcs x4, x4, x8 - ldp x5, x6, [x0, #16] - ldp x9, x10, [x0, #80] - adcs x5, x5, x9 - adcs x6, x6, x10 - ldp x20, x21, [x0, #96] - adcs x7, x7, x20 - adcs x8, x8, x21 - ldp x22, x23, [x0, #112] - adcs x9, x9, x22 - adcs x10, x10, x23 - adcs x24, x1, xzr - adc x2, x1, xzr - cmn x1, #0x1 - eor x11, x11, x1 - adcs x3, x11, x3 - eor x12, x12, x1 - adcs x4, x12, x4 - eor x13, x13, x1 - adcs x5, x13, x5 - eor x14, x14, x1 - adcs x6, x14, x6 - eor x15, x15, x1 - adcs x7, x15, x7 - eor x16, x16, x1 - adcs x8, x16, x8 - eor x17, x17, x1 - adcs x9, x17, x9 - eor x19, x19, x1 - adcs x10, x19, x10 - adcs 
x20, x20, x24 - adcs x21, x21, x2 - adcs x22, x22, x2 - adc x23, x23, x2 - stp x3, x4, [x0, #32] - stp x5, x6, [x0, #48] - stp x7, x8, [x0, #64] - stp x9, x10, [x0, #80] - stp x20, x21, [x0, #96] - stp x22, x23, [x0, #112] - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32_neon.S b/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32_neon.S deleted file mode 100644 index 6be2bcb3846..00000000000 --- a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32_neon.S +++ /dev/null @@ -1,658 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Square, z := x^2 -// Input x[16]; output z[32]; temporary buffer t[>=24] -// -// extern void bignum_ksqr_16_32_neon -// (uint64_t z[static 32], uint64_t x[static 16], uint64_t t[static 24]); -// -// This is a Karatsuba-style function squaring half-sized results -// and using temporary buffer t for intermediate results. -// -// Standard ARM ABI: X0 = z, X1 = x, X2 = t -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ksqr_16_32_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ksqr_16_32_neon) - .text - .balign 4 - -// Subroutine-safe copies of the output, inputs and temporary buffer pointers - -#define z x23 -#define x x24 -#define t x25 - -// More variables for sign masks, with s also necessarily subroutine-safe - -#define s x19 - - -S2N_BN_SYMBOL(bignum_ksqr_16_32_neon): - -// Save registers, including return address - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x30, [sp, #-16]! - -// Move parameters into subroutine-safe places - - mov z, x0 - mov x, x1 - mov t, x2 - -// Compute L = x_lo * y_lo in bottom half of buffer (size 8 x 8 -> 16) - - bl bignum_ksqr_16_32_neon_local_sqr_8_16 - -// Compute absolute difference [t..] 
= |x_lo - x_hi| - - ldp x10, x11, [x] - ldp x8, x9, [x, #64] - subs x10, x10, x8 - sbcs x11, x11, x9 - ldp x12, x13, [x, #16] - ldp x8, x9, [x, #80] - sbcs x12, x12, x8 - sbcs x13, x13, x9 - ldp x14, x15, [x, #32] - ldp x8, x9, [x, #96] - sbcs x14, x14, x8 - sbcs x15, x15, x9 - ldp x16, x17, [x, #48] - ldp x8, x9, [x, #112] - sbcs x16, x16, x8 - sbcs x17, x17, x9 - csetm s, cc - adds xzr, s, s - eor x10, x10, s - adcs x10, x10, xzr - eor x11, x11, s - adcs x11, x11, xzr - stp x10, x11, [t] - eor x12, x12, s - adcs x12, x12, xzr - eor x13, x13, s - adcs x13, x13, xzr - stp x12, x13, [t, #16] - eor x14, x14, s - adcs x14, x14, xzr - eor x15, x15, s - adcs x15, x15, xzr - stp x14, x15, [t, #32] - eor x16, x16, s - adcs x16, x16, xzr - eor x17, x17, s - adcs x17, x17, xzr - stp x16, x17, [t, #48] - -// Compute H = x_hi * y_hi in top half of buffer (size 8 x 8 -> 16) - - add x0, z, #128 - add x1, x, #64 - bl bignum_ksqr_16_32_neon_local_sqr_8_16 - -// Compute H' = H + L_top in place of H (it cannot overflow) -// First add 8-sized block then propagate carry through next 8 - - ldp x10, x11, [z, #128] - ldp x12, x13, [z, #64] - adds x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128] - - ldp x10, x11, [z, #128+16] - ldp x12, x13, [z, #64+16] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+16] - - ldp x10, x11, [z, #128+32] - ldp x12, x13, [z, #64+32] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+32] - - ldp x10, x11, [z, #128+48] - ldp x12, x13, [z, #64+48] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [z, #128+48] - - ldp x10, x11, [z, #128+64] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+64] - - ldp x10, x11, [z, #128+80] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+80] - - ldp x10, x11, [z, #128+96] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+96] - - ldp x10, x11, [z, #128+112] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [z, #128+112] - -// Compute M = |x_lo - x_hi| * |y_hi - y_lo| in [t+8...], size 16 - - add x0, t, #64 - mov x1, t - bl bignum_ksqr_16_32_neon_local_sqr_8_16 - -// Add the interlocking H' and L_bot terms, storing in registers x15..x0 -// Intercept the carry at the 8 + 16 = 24 position and store it in x. -// (Note that we no longer need the input x was pointing at.) 
- - ldp x0, x1, [z] - ldp x16, x17, [z, #128] - adds x0, x0, x16 - adcs x1, x1, x17 - ldp x2, x3, [z, #16] - ldp x16, x17, [z, #144] - adcs x2, x2, x16 - adcs x3, x3, x17 - ldp x4, x5, [z, #32] - ldp x16, x17, [z, #160] - adcs x4, x4, x16 - adcs x5, x5, x17 - ldp x6, x7, [z, #48] - ldp x16, x17, [z, #176] - adcs x6, x6, x16 - adcs x7, x7, x17 - ldp x8, x9, [z, #128] - ldp x16, x17, [z, #192] - adcs x8, x8, x16 - adcs x9, x9, x17 - ldp x10, x11, [z, #144] - ldp x16, x17, [z, #208] - adcs x10, x10, x16 - adcs x11, x11, x17 - ldp x12, x13, [z, #160] - ldp x16, x17, [z, #224] - adcs x12, x12, x16 - adcs x13, x13, x17 - ldp x14, x15, [z, #176] - ldp x16, x17, [z, #240] - adcs x14, x14, x16 - adcs x15, x15, x17 - cset x, cs - -// Subtract the mid-term cross product M - - ldp x16, x17, [t, #64] - subs x0, x0, x16 - sbcs x1, x1, x17 - stp x0, x1, [z, #64] - ldp x16, x17, [t, #80] - sbcs x2, x2, x16 - sbcs x3, x3, x17 - stp x2, x3, [z, #80] - ldp x16, x17, [t, #96] - sbcs x4, x4, x16 - sbcs x5, x5, x17 - stp x4, x5, [z, #96] - ldp x16, x17, [t, #112] - sbcs x6, x6, x16 - sbcs x7, x7, x17 - stp x6, x7, [z, #112] - ldp x16, x17, [t, #128] - sbcs x8, x8, x16 - sbcs x9, x9, x17 - stp x8, x9, [z, #128] - ldp x16, x17, [t, #144] - sbcs x10, x10, x16 - sbcs x11, x11, x17 - stp x10, x11, [z, #144] - ldp x16, x17, [t, #160] - sbcs x12, x12, x16 - sbcs x13, x13, x17 - stp x12, x13, [z, #160] - ldp x16, x17, [t, #176] - sbcs x14, x14, x16 - sbcs x15, x15, x17 - stp x14, x15, [z, #176] - -// Get the next digits effectively resulting so far starting at 24 - - sbcs x, x, xzr - csetm t, cc - -// Now the final 8 digits of padding; the first one is special in using x -// and also in getting the carry chain started - - ldp x10, x11, [z, #192] - adds x10, x10, x - adcs x11, x11, t - stp x10, x11, [z, #192] - ldp x10, x11, [z, #208] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #208] - ldp x10, x11, [z, #224] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #224] - ldp x10, x11, [z, #240] - adcs x10, x10, t - adcs x11, x11, t - stp x10, x11, [z, #240] - -// Restore registers and return - - ldp x25, x30, [sp], #16 - ldp x23, x24, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - - ret - -// ----------------------------------------------------------------------------- -// Local 8x8->16 squaring routine, shared to reduce code size. Effectively -// the same as bignum_sqr_8_16_neon without the scratch register preservation. -// ----------------------------------------------------------------------------- - -bignum_ksqr_16_32_neon_local_sqr_8_16: -// Load registers. 
- ldp x2, x3, [x1] -ldr q20, [x1] - ldp x4, x5, [x1, #16] -ldr q21, [x1, #16] - ldp x6, x7, [x1, #32] -ldr q22, [x1, #32] - ldp x8, x9, [x1, #48] -ldr q23, [x1, #48] -movi v30.2d, #0xffffffff - - mul x17, x2, x4 - mul x14, x3, x5 - -// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 -// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) -ext v1.16b, v20.16b, v20.16b, #8 - umulh x20, x2, x4 -shrn v2.2s, v20.2d, #32 - subs x21, x2, x3 -zip1 v0.2s, v20.2s, v1.2s - cneg x21, x21, cc // cc = lo, ul, last -umull v5.2d, v2.2s, v2.2s - csetm x11, cc // cc = lo, ul, last -umull v6.2d, v2.2s, v0.2s - subs x12, x5, x4 -umull v3.2d, v0.2s, v0.2s - cneg x12, x12, cc // cc = lo, ul, last -mov v1.16b, v6.16b - mul x13, x21, x12 -usra v1.2d, v3.2d, #32 - umulh x12, x21, x12 -and v4.16b, v1.16b, v30.16b - cinv x11, x11, cc // cc = lo, ul, last -add v4.2d, v4.2d, v6.2d - eor x13, x13, x11 -usra v5.2d, v4.2d, #32 - eor x12, x12, x11 -sli v3.2d, v4.2d, #32 - adds x19, x17, x20 -usra v5.2d, v1.2d, #32 - adc x20, x20, xzr - // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) - ext v1.16b, v21.16b, v21.16b, #8 - umulh x21, x3, x5 - shrn v2.2s, v21.2d, #32 - adds x19, x19, x14 - zip1 v0.2s, v21.2s, v1.2s - adcs x20, x20, x21 - adc x21, x21, xzr - adds x20, x20, x14 - adc x21, x21, xzr - cmn x11, #0x1 - adcs x19, x19, x13 -mov x13, v3.d[1] // mul x13, x3, x3 - adcs x20, x20, x12 -mov x14, v5.d[1] // umulh x14, x3, x3 - adc x21, x21, x11 -mov x12, v3.d[0] // mul x12, x2, x2 - adds x17, x17, x17 -mov x11, v5.d[0] // umulh x11, x2, x2 - adcs x19, x19, x19 - umull v5.2d, v2.2s, v2.2s - adcs x20, x20, x20 - umull v6.2d, v2.2s, v0.2s - adcs x21, x21, x21 - umull v3.2d, v0.2s, v0.2s - adc x10, xzr, xzr - mov v1.16b, v6.16b - - mul x15, x2, x3 - usra v1.2d, v3.2d, #32 - umulh x16, x2, x3 - and v4.16b, v1.16b, v30.16b - adds x11, x11, x15 - add v4.2d, v4.2d, v6.2d - adcs x13, x13, x16 - usra v5.2d, v4.2d, #32 - adc x14, x14, xzr - sli v3.2d, v4.2d, #32 - adds x11, x11, x15 - usra v5.2d, v1.2d, #32 - adcs x13, x13, x16 - adc x14, x14, xzr - stp x12, x11, [x0] - mov x11, v5.d[0] // umulh x11, x4, x4 - adds x17, x17, x13 - mov x13, v3.d[1] // mul x13, x5, x5 - adcs x19, x19, x14 - mov x14, v5.d[1] // umulh x14, x5, x5 - adcs x20, x20, xzr - mov x12, v3.d[0] // mul x12, x4, x4 - adcs x21, x21, xzr -// NEON: prepare muls in the upper half -ext v1.16b, v22.16b, v22.16b, #8 - adc x10, x10, xzr -shrn v2.2s, v22.2d, #32 - stp x17, x19, [x0, #16] -zip1 v0.2s, v22.2s, v1.2s - mul x15, x4, x5 -umull v5.2d, v2.2s, v2.2s - umulh x16, x4, x5 -umull v6.2d, v2.2s, v0.2s - adds x11, x11, x15 -umull v3.2d, v0.2s, v0.2s - adcs x13, x13, x16 -mov v1.16b, v6.16b - adc x14, x14, xzr -usra v1.2d, v3.2d, #32 - adds x11, x11, x15 -and v4.16b, v1.16b, v30.16b - adcs x13, x13, x16 -add v4.2d, v4.2d, v6.2d - adc x14, x14, xzr -usra v5.2d, v4.2d, #32 - adds x12, x12, x20 -sli v3.2d, v4.2d, #32 - adcs x11, x11, x21 -usra v5.2d, v1.2d, #32 - stp x12, x11, [x0, #32] - // NEON: prepare muls in the upper half - ext v1.16b, v23.16b, v23.16b, #8 - adcs x13, x13, x10 - shrn v2.2s, v23.2d, #32 - adc x14, x14, xzr - zip1 v0.2s, v23.2s, v1.2s - stp x13, x14, [x0, #48] - -// Scalar: square the upper half with a slight variant of the previous block - mul x17, x6, x8 - umull v16.2d, v2.2s, v2.2s - mul x14, x7, x9 - umull v6.2d, v2.2s, v0.2s - umulh x20, x6, x8 - umull v18.2d, v0.2s, v0.2s - subs x21, x6, x7 - cneg x21, x21, cc // cc = lo, ul, last - mov v1.16b, v6.16b - csetm x11, cc // cc = lo, ul, last - subs x12, x9, 
x8 - cneg x12, x12, cc // cc = lo, ul, last - usra v1.2d, v18.2d, #32 - mul x13, x21, x12 - and v4.16b, v1.16b, v30.16b - umulh x12, x21, x12 - add v4.2d, v4.2d, v6.2d - cinv x11, x11, cc // cc = lo, ul, last - eor x13, x13, x11 - eor x12, x12, x11 - usra v16.2d, v4.2d, #32 - adds x19, x17, x20 - adc x20, x20, xzr - sli v18.2d, v4.2d, #32 - umulh x21, x7, x9 - adds x19, x19, x14 - adcs x20, x20, x21 - adc x21, x21, xzr - adds x20, x20, x14 -mov x14, v5.d[1] - adc x21, x21, xzr - cmn x11, #0x1 - adcs x19, x19, x13 -mov x13, v3.d[1] - adcs x20, x20, x12 -mov x12, v3.d[0] - adc x21, x21, x11 -mov x11, v5.d[0] - adds x17, x17, x17 - adcs x19, x19, x19 - usra v16.2d, v1.2d, #32 - adcs x20, x20, x20 - adcs x21, x21, x21 - adc x10, xzr, xzr -// NEON: two mul+umulhs for the next stage -uzp2 v17.4s, v21.4s, v23.4s - mul x15, x6, x7 -xtn v4.2s, v23.2d - umulh x16, x6, x7 - mov x22, v16.d[0] - adds x11, x11, x15 - adcs x13, x13, x16 -xtn v5.2s, v21.2d - adc x14, x14, xzr - adds x11, x11, x15 -rev64 v1.4s, v21.4s - adcs x13, x13, x16 - adc x14, x14, xzr - stp x12, x11, [x0, #64] - adds x17, x17, x13 - mov x13, v18.d[1] - adcs x19, x19, x14 - mov x14, v16.d[1] - adcs x20, x20, xzr - mov x12, v18.d[0] - adcs x21, x21, xzr - adc x10, x10, xzr -umull v6.2d, v4.2s, v5.2s - stp x17, x19, [x0, #80] -umull v7.2d, v4.2s, v17.2s - mul x15, x8, x9 -uzp2 v16.4s, v23.4s, v23.4s - umulh x16, x8, x9 -mul v0.4s, v1.4s, v23.4s - adds x11, x22, x15 - adcs x13, x13, x16 -usra v7.2d, v6.2d, #32 - adc x14, x14, xzr - adds x11, x11, x15 -umull v1.2d, v16.2s, v17.2s - adcs x13, x13, x16 - adc x14, x14, xzr -uaddlp v0.2d, v0.4s - adds x12, x12, x20 - adcs x11, x11, x21 -and v2.16b, v7.16b, v30.16b -umlal v2.2d, v16.2s, v5.2s -shl v0.2d, v0.2d, #32 -usra v1.2d, v7.2d, #32 -umlal v0.2d, v4.2s, v5.2s -mov x16, v0.d[1] -mov x15, v0.d[0] -usra v1.2d, v2.2d, #32 -mov x20, v1.d[0] -mov x21, v1.d[1] - stp x12, x11, [x0, #96] - adcs x13, x13, x10 - adc x14, x14, xzr - stp x13, x14, [x0, #112] - -// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] - - mul x10, x2, x6 - mul x14, x3, x7 - umulh x17, x2, x6 - adds x14, x14, x17 - umulh x17, x3, x7 - adcs x15, x15, x17 - adcs x16, x16, x20 - adc x17, x21, xzr - adds x11, x14, x10 - adcs x14, x15, x14 - adcs x15, x16, x15 - adcs x16, x17, x16 - adc x17, xzr, x17 - adds x12, x14, x10 - adcs x13, x15, x11 - adcs x14, x16, x14 - adcs x15, x17, x15 - adcs x16, xzr, x16 - adc x17, xzr, x17 - subs x22, x4, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x8 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x15, x15, x21 - eor x20, x20, x19 - adcs x16, x16, x20 - adc x17, x17, x19 - subs x22, x2, x3 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x7, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x11, x11, x21 - eor x20, x20, x19 - adcs x12, x12, x20 - adcs x13, x13, x19 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x7 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x14, x14, x21 - eor 
x20, x20, x19 - adcs x15, x15, x20 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x4 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x8, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x12, x12, x21 - eor x20, x20, x19 - adcs x13, x13, x20 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x4 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x8, x7 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - adds x10, x10, x10 - adcs x11, x11, x11 - adcs x12, x12, x12 - adcs x13, x13, x13 - adcs x14, x14, x14 - adcs x15, x15, x15 - adcs x16, x16, x16 - adcs x17, x17, x17 - adc x19, xzr, xzr - -// Add it back to the buffer - - ldp x2, x3, [x0, #32] - adds x10, x10, x2 - adcs x11, x11, x3 - stp x10, x11, [x0, #32] - - ldp x2, x3, [x0, #48] - adcs x12, x12, x2 - adcs x13, x13, x3 - stp x12, x13, [x0, #48] - - ldp x2, x3, [x0, #64] - adcs x14, x14, x2 - adcs x15, x15, x3 - stp x14, x15, [x0, #64] - - ldp x2, x3, [x0, #80] - adcs x16, x16, x2 - adcs x17, x17, x3 - stp x16, x17, [x0, #80] - - ldp x2, x3, [x0, #96] - adcs x2, x2, x19 - adcs x3, x3, xzr - stp x2, x3, [x0, #96] - - ldp x2, x3, [x0, #112] - adcs x2, x2, xzr - adc x3, x3, xzr - stp x2, x3, [x0, #112] - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64_neon.S b/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64_neon.S deleted file mode 100644 index 04197642339..00000000000 --- a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64_neon.S +++ /dev/null @@ -1,1075 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Square, z := x^2 -// Input x[32]; output z[64]; temporary buffer t[>=72] -// -// extern void bignum_ksqr_32_64_neon -// (uint64_t z[static 64], uint64_t x[static 32], uint64_t t[static 72]); -// -// This is a Karatsuba-style function squaring half-sized results -// and using temporary buffer t for intermediate results. -// -// Standard ARM ABI: X0 = z, X1 = x, X2 = t -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ksqr_32_64_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ksqr_32_64_neon) - .text - .balign 4 - -#define K 16 -#define L 8 // (K/2) - -#define z x19 -#define x x20 -#define t x21 - -#define c x16 - - -S2N_BN_SYMBOL(bignum_ksqr_32_64_neon): - -// Save extra registers and return address, store parameters safely - - stp x19, x20, [sp, #-16]! 
- stp x21, x30, [sp, #-16]! - - mov z, x0 - mov x, x1 - mov t, x2 - -// Compute L = x_lo * y_lo in bottom half of buffer (size 16 x 16 -> 32) - - bl bignum_ksqr_32_64_neon_local_ksqr_16_32 - -// Compute H = x_hi * y_hi in top half of buffer (size 16 x 16 -> 32) - - add x0, z, #16*K - add x1, x, #8*K - mov x2, t - bl bignum_ksqr_32_64_neon_local_ksqr_16_32 - -// Compute H' = H + L_top in place of H (it cannot overflow) - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z, #16*8] - adds x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*16] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*9] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*17] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*10] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*18] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*11] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*19] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*12] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*20] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*13] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*21] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*14] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*22] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*15] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*23] - - ldp x0, x1, [z, #16*24] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*24] - - ldp x0, x1, [z, #16*25] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*25] - - ldp x0, x1, [z, #16*26] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*26] - - ldp x0, x1, [z, #16*27] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*27] - - ldp x0, x1, [z, #16*28] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*28] - - ldp x0, x1, [z, #16*29] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*29] - - ldp x0, x1, [z, #16*30] - adcs x0, x0, xzr - adcs x1, x1, xzr - stp x0, x1, [z, #16*30] - - ldp x0, x1, [z, #16*31] - adcs x0, x0, xzr - adc x1, x1, xzr - stp x0, x1, [z, #16*31] - -// Compute absolute difference [t..] 
= |x_lo - x_hi| - - ldp x0, x1, [x, #128] - ldp x16, x17, [x] - subs x0, x0, x16 - sbcs x1, x1, x17 - - ldp x2, x3, [x, #144] - ldp x16, x17, [x, #16] - sbcs x2, x2, x16 - sbcs x3, x3, x17 - - ldp x4, x5, [x, #160] - ldp x16, x17, [x, #32] - sbcs x4, x4, x16 - sbcs x5, x5, x17 - - ldp x6, x7, [x, #176] - ldp x16, x17, [x, #48] - sbcs x6, x6, x16 - sbcs x7, x7, x17 - - ldp x8, x9, [x, #192] - ldp x16, x17, [x, #64] - sbcs x8, x8, x16 - sbcs x9, x9, x17 - - ldp x10, x11, [x, #208] - ldp x16, x17, [x, #80] - sbcs x10, x10, x16 - sbcs x11, x11, x17 - - ldp x12, x13, [x, #224] - ldp x16, x17, [x, #96] - sbcs x12, x12, x16 - sbcs x13, x13, x17 - - ldp x14, x15, [x, #240] - ldp x16, x17, [x, #112] - sbcs x14, x14, x16 - sbcs x15, x15, x17 - - sbc c, xzr, xzr - - adds xzr, c, c - - eor x0, x0, c - adcs x0, x0, xzr - eor x1, x1, c - adcs x1, x1, xzr - stp x0, x1, [t] - - eor x2, x2, c - adcs x2, x2, xzr - eor x3, x3, c - adcs x3, x3, xzr - stp x2, x3, [t, #16] - - eor x4, x4, c - adcs x4, x4, xzr - eor x5, x5, c - adcs x5, x5, xzr - stp x4, x5, [t, #32] - - eor x6, x6, c - adcs x6, x6, xzr - eor x7, x7, c - adcs x7, x7, xzr - stp x6, x7, [t, #48] - - eor x8, x8, c - adcs x8, x8, xzr - eor x9, x9, c - adcs x9, x9, xzr - stp x8, x9, [t, #64] - - eor x10, x10, c - adcs x10, x10, xzr - eor x11, x11, c - adcs x11, x11, xzr - stp x10, x11, [t, #80] - - eor x12, x12, c - adcs x12, x12, xzr - eor x13, x13, c - adcs x13, x13, xzr - stp x12, x13, [t, #96] - - eor x14, x14, c - adcs x14, x14, xzr - eor x15, x15, c - adc x15, x15, xzr - stp x14, x15, [t, #112] - -// Compute M = |x_lo - x_hi|^2, size 32 - - add x0, t, #8*K - mov x1, t - add x2, t, #24*K - bl bignum_ksqr_32_64_neon_local_ksqr_16_32 - -// Add the interlocking H' and L_bot terms -// Intercept the carry at the 3k position and store it in x. -// (Note that we no longer need the input x was pointing at.) 
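Both ksqr sizes feed their middle square from an |x_lo - x_hi| operand computed with the same constant-time pattern seen in the absolute-difference blocks above: a subs/sbcs chain leaves a borrow, csetm turns the borrow into an all-ones mask, and an eor/adc pass conditionally two's-complement-negates the whole difference (the two files merely subtract in opposite orders, which does not change the absolute value). A plain-C rendering of that pattern, with a hypothetical helper name and 64-bit limbs:

#include <stdint.h>

/* d = |a - b| over n limbs, constant time. Illustration only. */
static void bignum_absdiff(uint64_t *d, const uint64_t *a, const uint64_t *b, int n) {
  uint64_t borrow = 0;
  for (int i = 0; i < n; i++) {            /* d = a - b, keep the final borrow */
    unsigned __int128 t = (unsigned __int128)a[i] - b[i] - borrow;
    d[i] = (uint64_t)t;
    borrow = (uint64_t)(t >> 64) & 1;
  }
  uint64_t mask = (uint64_t)0 - borrow;    /* csetm: all-ones iff a < b */
  unsigned __int128 carry = mask & 1;      /* the +1 of two's-complement negation */
  for (int i = 0; i < n; i++) {            /* eor/adc: d = (d ^ mask) + (a < b) */
    carry += d[i] ^ mask;
    d[i] = (uint64_t)carry;
    carry >>= 64;
  }
}

The 16->32 routine uses this shape with n = 8, the 32->64 routine with n = 16.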
- - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z] - adds x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*8] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*1] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*9] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*2] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*10] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*3] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*11] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*4] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*12] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*5] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*13] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*6] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*14] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*7] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*15] - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [z, #16*24] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*16] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [z, #16*25] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*17] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [z, #16*26] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*18] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [z, #16*27] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*19] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [z, #16*28] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*20] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [z, #16*29] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*21] - - ldp x0, x1, [z, #16*22] - ldp x2, x3, [z, #16*30] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*22] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [z, #16*31] - adcs x0, x0, x2 - adcs x1, x1, x3 - stp x0, x1, [z, #16*23] - - cset x, cs - -// Subtract the mid-term cross product M - - ldp x0, x1, [z, #16*L] - ldp x2, x3, [t, #16*L] - subs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*L] - - ldp x0, x1, [z, #16*9] - ldp x2, x3, [t, #16*9] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*9] - - ldp x0, x1, [z, #16*10] - ldp x2, x3, [t, #16*10] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*10] - - ldp x0, x1, [z, #16*11] - ldp x2, x3, [t, #16*11] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*11] - - ldp x0, x1, [z, #16*12] - ldp x2, x3, [t, #16*12] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*12] - - ldp x0, x1, [z, #16*13] - ldp x2, x3, [t, #16*13] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*13] - - ldp x0, x1, [z, #16*14] - ldp x2, x3, [t, #16*14] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*14] - - ldp x0, x1, [z, #16*15] - ldp x2, x3, [t, #16*15] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*15] - - ldp x0, x1, [z, #16*16] - ldp x2, x3, [t, #16*16] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*16] - - ldp x0, x1, [z, #16*17] - ldp x2, x3, [t, #16*17] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*17] - - ldp x0, x1, [z, #16*18] - ldp x2, x3, [t, #16*18] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*18] - - ldp x0, x1, [z, #16*19] - ldp x2, x3, [t, #16*19] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*19] - - ldp x0, x1, [z, #16*20] - ldp x2, x3, [t, #16*20] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*20] - - ldp x0, x1, [z, #16*21] - ldp x2, x3, [t, #16*21] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*21] - - ldp x0, x1, [z, #16*22] - ldp x2, 
x3, [t, #16*22] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*22] - - ldp x0, x1, [z, #16*23] - ldp x2, x3, [t, #16*23] - sbcs x0, x0, x2 - sbcs x1, x1, x3 - stp x0, x1, [z, #16*23] - -// Get the next digits effectively resulting so far starting at 3k -// [...,c,c,c,c,x] - - sbcs x, x, xzr - csetm c, cc - -// Now propagate through the top quarter of the result - - ldp x0, x1, [z, #16*24] - adds x0, x0, x - adcs x1, x1, c - stp x0, x1, [z, #16*24] - - ldp x0, x1, [z, #16*25] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*25] - - ldp x0, x1, [z, #16*26] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*26] - - ldp x0, x1, [z, #16*27] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*27] - - ldp x0, x1, [z, #16*28] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*28] - - ldp x0, x1, [z, #16*29] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*29] - - ldp x0, x1, [z, #16*30] - adcs x0, x0, c - adcs x1, x1, c - stp x0, x1, [z, #16*30] - - ldp x0, x1, [z, #16*31] - adcs x0, x0, c - adc x1, x1, c - stp x0, x1, [z, #16*31] - -// Restore - - ldp x21, x30, [sp], #16 - ldp x19, x20, [sp], #16 - - ret - -// Local copy of bignum_ksqr_16_32, identical to main one. -// This includes in turn a copy of bignum_sqr_8_16. - -bignum_ksqr_32_64_neon_local_ksqr_16_32: - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x30, [sp, #-16]! - mov x23, x0 - mov x24, x1 - mov x25, x2 - bl bignum_ksqr_32_64_neon_local_sqr_8_16 - ldp x10, x11, [x24] - ldp x8, x9, [x24, #64] - subs x10, x10, x8 - sbcs x11, x11, x9 - ldp x12, x13, [x24, #16] - ldp x8, x9, [x24, #80] - sbcs x12, x12, x8 - sbcs x13, x13, x9 - ldp x14, x15, [x24, #32] - ldp x8, x9, [x24, #96] - sbcs x14, x14, x8 - sbcs x15, x15, x9 - ldp x16, x17, [x24, #48] - ldp x8, x9, [x24, #112] - sbcs x16, x16, x8 - sbcs x17, x17, x9 - csetm x19, cc - cmn x19, x19 - eor x10, x10, x19 - adcs x10, x10, xzr - eor x11, x11, x19 - adcs x11, x11, xzr - stp x10, x11, [x25] - eor x12, x12, x19 - adcs x12, x12, xzr - eor x13, x13, x19 - adcs x13, x13, xzr - stp x12, x13, [x25, #16] - eor x14, x14, x19 - adcs x14, x14, xzr - eor x15, x15, x19 - adcs x15, x15, xzr - stp x14, x15, [x25, #32] - eor x16, x16, x19 - adcs x16, x16, xzr - eor x17, x17, x19 - adcs x17, x17, xzr - stp x16, x17, [x25, #48] - add x0, x23, #0x80 - add x1, x24, #0x40 - bl bignum_ksqr_32_64_neon_local_sqr_8_16 - ldp x10, x11, [x23, #128] - ldp x12, x13, [x23, #64] - adds x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x23, #128] - ldp x10, x11, [x23, #144] - ldp x12, x13, [x23, #80] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x23, #144] - ldp x10, x11, [x23, #160] - ldp x12, x13, [x23, #96] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x23, #160] - ldp x10, x11, [x23, #176] - ldp x12, x13, [x23, #112] - adcs x10, x10, x12 - adcs x11, x11, x13 - stp x10, x11, [x23, #176] - ldp x10, x11, [x23, #192] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x23, #192] - ldp x10, x11, [x23, #208] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x23, #208] - ldp x10, x11, [x23, #224] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x23, #224] - ldp x10, x11, [x23, #240] - adcs x10, x10, xzr - adcs x11, x11, xzr - stp x10, x11, [x23, #240] - add x0, x25, #0x40 - mov x1, x25 - bl bignum_ksqr_32_64_neon_local_sqr_8_16 - ldp x0, x1, [x23] - ldp x16, x17, [x23, #128] - adds x0, x0, x16 - adcs x1, x1, x17 - ldp x2, x3, [x23, #16] - ldp x16, x17, [x23, #144] - adcs x2, x2, x16 - 
adcs x3, x3, x17 - ldp x4, x5, [x23, #32] - ldp x16, x17, [x23, #160] - adcs x4, x4, x16 - adcs x5, x5, x17 - ldp x6, x7, [x23, #48] - ldp x16, x17, [x23, #176] - adcs x6, x6, x16 - adcs x7, x7, x17 - ldp x8, x9, [x23, #128] - ldp x16, x17, [x23, #192] - adcs x8, x8, x16 - adcs x9, x9, x17 - ldp x10, x11, [x23, #144] - ldp x16, x17, [x23, #208] - adcs x10, x10, x16 - adcs x11, x11, x17 - ldp x12, x13, [x23, #160] - ldp x16, x17, [x23, #224] - adcs x12, x12, x16 - adcs x13, x13, x17 - ldp x14, x15, [x23, #176] - ldp x16, x17, [x23, #240] - adcs x14, x14, x16 - adcs x15, x15, x17 - cset x24, cs - ldp x16, x17, [x25, #64] - subs x0, x0, x16 - sbcs x1, x1, x17 - stp x0, x1, [x23, #64] - ldp x16, x17, [x25, #80] - sbcs x2, x2, x16 - sbcs x3, x3, x17 - stp x2, x3, [x23, #80] - ldp x16, x17, [x25, #96] - sbcs x4, x4, x16 - sbcs x5, x5, x17 - stp x4, x5, [x23, #96] - ldp x16, x17, [x25, #112] - sbcs x6, x6, x16 - sbcs x7, x7, x17 - stp x6, x7, [x23, #112] - ldp x16, x17, [x25, #128] - sbcs x8, x8, x16 - sbcs x9, x9, x17 - stp x8, x9, [x23, #128] - ldp x16, x17, [x25, #144] - sbcs x10, x10, x16 - sbcs x11, x11, x17 - stp x10, x11, [x23, #144] - ldp x16, x17, [x25, #160] - sbcs x12, x12, x16 - sbcs x13, x13, x17 - stp x12, x13, [x23, #160] - ldp x16, x17, [x25, #176] - sbcs x14, x14, x16 - sbcs x15, x15, x17 - stp x14, x15, [x23, #176] - sbcs x24, x24, xzr - csetm x25, cc - ldp x10, x11, [x23, #192] - adds x10, x10, x24 - adcs x11, x11, x25 - stp x10, x11, [x23, #192] - ldp x10, x11, [x23, #208] - adcs x10, x10, x25 - adcs x11, x11, x25 - stp x10, x11, [x23, #208] - ldp x10, x11, [x23, #224] - adcs x10, x10, x25 - adcs x11, x11, x25 - stp x10, x11, [x23, #224] - ldp x10, x11, [x23, #240] - adcs x10, x10, x25 - adcs x11, x11, x25 - stp x10, x11, [x23, #240] - ldp x25, x30, [sp], #16 - ldp x23, x24, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - ret - -bignum_ksqr_32_64_neon_local_sqr_8_16: -// Load registers. 
- ldp x2, x3, [x1] -ldr q20, [x1] - ldp x4, x5, [x1, #16] -ldr q21, [x1, #16] - ldp x6, x7, [x1, #32] -ldr q22, [x1, #32] - ldp x8, x9, [x1, #48] -ldr q23, [x1, #48] -movi v30.2d, #0xffffffff - - mul x17, x2, x4 - mul x14, x3, x5 - -// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 -// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) -ext v1.16b, v20.16b, v20.16b, #8 - umulh x20, x2, x4 -shrn v2.2s, v20.2d, #32 - subs x21, x2, x3 -zip1 v0.2s, v20.2s, v1.2s - cneg x21, x21, cc // cc = lo, ul, last -umull v5.2d, v2.2s, v2.2s - csetm x11, cc // cc = lo, ul, last -umull v6.2d, v2.2s, v0.2s - subs x12, x5, x4 -umull v3.2d, v0.2s, v0.2s - cneg x12, x12, cc // cc = lo, ul, last -mov v1.16b, v6.16b - mul x13, x21, x12 -usra v1.2d, v3.2d, #32 - umulh x12, x21, x12 -and v4.16b, v1.16b, v30.16b - cinv x11, x11, cc // cc = lo, ul, last -add v4.2d, v4.2d, v6.2d - eor x13, x13, x11 -usra v5.2d, v4.2d, #32 - eor x12, x12, x11 -sli v3.2d, v4.2d, #32 - adds x19, x17, x20 -usra v5.2d, v1.2d, #32 - adc x20, x20, xzr - // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) - ext v1.16b, v21.16b, v21.16b, #8 - umulh x21, x3, x5 - shrn v2.2s, v21.2d, #32 - adds x19, x19, x14 - zip1 v0.2s, v21.2s, v1.2s - adcs x20, x20, x21 - adc x21, x21, xzr - adds x20, x20, x14 - adc x21, x21, xzr - cmn x11, #0x1 - adcs x19, x19, x13 -mov x13, v3.d[1] // mul x13, x3, x3 - adcs x20, x20, x12 -mov x14, v5.d[1] // umulh x14, x3, x3 - adc x21, x21, x11 -mov x12, v3.d[0] // mul x12, x2, x2 - adds x17, x17, x17 -mov x11, v5.d[0] // umulh x11, x2, x2 - adcs x19, x19, x19 - umull v5.2d, v2.2s, v2.2s - adcs x20, x20, x20 - umull v6.2d, v2.2s, v0.2s - adcs x21, x21, x21 - umull v3.2d, v0.2s, v0.2s - adc x10, xzr, xzr - mov v1.16b, v6.16b - - mul x15, x2, x3 - usra v1.2d, v3.2d, #32 - umulh x16, x2, x3 - and v4.16b, v1.16b, v30.16b - adds x11, x11, x15 - add v4.2d, v4.2d, v6.2d - adcs x13, x13, x16 - usra v5.2d, v4.2d, #32 - adc x14, x14, xzr - sli v3.2d, v4.2d, #32 - adds x11, x11, x15 - usra v5.2d, v1.2d, #32 - adcs x13, x13, x16 - adc x14, x14, xzr - stp x12, x11, [x0] - mov x11, v5.d[0] // umulh x11, x4, x4 - adds x17, x17, x13 - mov x13, v3.d[1] // mul x13, x5, x5 - adcs x19, x19, x14 - mov x14, v5.d[1] // umulh x14, x5, x5 - adcs x20, x20, xzr - mov x12, v3.d[0] // mul x12, x4, x4 - adcs x21, x21, xzr -// NEON: prepare muls in the upper half -ext v1.16b, v22.16b, v22.16b, #8 - adc x10, x10, xzr -shrn v2.2s, v22.2d, #32 - stp x17, x19, [x0, #16] -zip1 v0.2s, v22.2s, v1.2s - mul x15, x4, x5 -umull v5.2d, v2.2s, v2.2s - umulh x16, x4, x5 -umull v6.2d, v2.2s, v0.2s - adds x11, x11, x15 -umull v3.2d, v0.2s, v0.2s - adcs x13, x13, x16 -mov v1.16b, v6.16b - adc x14, x14, xzr -usra v1.2d, v3.2d, #32 - adds x11, x11, x15 -and v4.16b, v1.16b, v30.16b - adcs x13, x13, x16 -add v4.2d, v4.2d, v6.2d - adc x14, x14, xzr -usra v5.2d, v4.2d, #32 - adds x12, x12, x20 -sli v3.2d, v4.2d, #32 - adcs x11, x11, x21 -usra v5.2d, v1.2d, #32 - stp x12, x11, [x0, #32] - // NEON: prepare muls in the upper half - ext v1.16b, v23.16b, v23.16b, #8 - adcs x13, x13, x10 - shrn v2.2s, v23.2d, #32 - adc x14, x14, xzr - zip1 v0.2s, v23.2s, v1.2s - stp x13, x14, [x0, #48] - -// Scalar: square the upper half with a slight variant of the previous block - mul x17, x6, x8 - umull v16.2d, v2.2s, v2.2s - mul x14, x7, x9 - umull v6.2d, v2.2s, v0.2s - umulh x20, x6, x8 - umull v18.2d, v0.2s, v0.2s - subs x21, x6, x7 - cneg x21, x21, cc // cc = lo, ul, last - mov v1.16b, v6.16b - csetm x11, cc // cc = lo, ul, last - subs x12, x9, 
x8 - cneg x12, x12, cc // cc = lo, ul, last - usra v1.2d, v18.2d, #32 - mul x13, x21, x12 - and v4.16b, v1.16b, v30.16b - umulh x12, x21, x12 - add v4.2d, v4.2d, v6.2d - cinv x11, x11, cc // cc = lo, ul, last - eor x13, x13, x11 - eor x12, x12, x11 - usra v16.2d, v4.2d, #32 - adds x19, x17, x20 - adc x20, x20, xzr - sli v18.2d, v4.2d, #32 - umulh x21, x7, x9 - adds x19, x19, x14 - adcs x20, x20, x21 - adc x21, x21, xzr - adds x20, x20, x14 -mov x14, v5.d[1] - adc x21, x21, xzr - cmn x11, #0x1 - adcs x19, x19, x13 -mov x13, v3.d[1] - adcs x20, x20, x12 -mov x12, v3.d[0] - adc x21, x21, x11 -mov x11, v5.d[0] - adds x17, x17, x17 - adcs x19, x19, x19 - usra v16.2d, v1.2d, #32 - adcs x20, x20, x20 - adcs x21, x21, x21 - adc x10, xzr, xzr -// NEON: two mul+umulhs for the next stage -uzp2 v17.4s, v21.4s, v23.4s - mul x15, x6, x7 -xtn v4.2s, v23.2d - umulh x16, x6, x7 - mov x22, v16.d[0] - adds x11, x11, x15 - adcs x13, x13, x16 -xtn v5.2s, v21.2d - adc x14, x14, xzr - adds x11, x11, x15 -rev64 v1.4s, v21.4s - adcs x13, x13, x16 - adc x14, x14, xzr - stp x12, x11, [x0, #64] - adds x17, x17, x13 - mov x13, v18.d[1] - adcs x19, x19, x14 - mov x14, v16.d[1] - adcs x20, x20, xzr - mov x12, v18.d[0] - adcs x21, x21, xzr - adc x10, x10, xzr -umull v6.2d, v4.2s, v5.2s - stp x17, x19, [x0, #80] -umull v7.2d, v4.2s, v17.2s - mul x15, x8, x9 -uzp2 v16.4s, v23.4s, v23.4s - umulh x16, x8, x9 -mul v0.4s, v1.4s, v23.4s - adds x11, x22, x15 - adcs x13, x13, x16 -usra v7.2d, v6.2d, #32 - adc x14, x14, xzr - adds x11, x11, x15 -umull v1.2d, v16.2s, v17.2s - adcs x13, x13, x16 - adc x14, x14, xzr -uaddlp v0.2d, v0.4s - adds x12, x12, x20 - adcs x11, x11, x21 -and v2.16b, v7.16b, v30.16b -umlal v2.2d, v16.2s, v5.2s -shl v0.2d, v0.2d, #32 -usra v1.2d, v7.2d, #32 -umlal v0.2d, v4.2s, v5.2s -mov x16, v0.d[1] -mov x15, v0.d[0] -usra v1.2d, v2.2d, #32 -mov x20, v1.d[0] -mov x21, v1.d[1] - stp x12, x11, [x0, #96] - adcs x13, x13, x10 - adc x14, x14, xzr - stp x13, x14, [x0, #112] - -// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] - - mul x10, x2, x6 - mul x14, x3, x7 - umulh x17, x2, x6 - adds x14, x14, x17 - umulh x17, x3, x7 - adcs x15, x15, x17 - adcs x16, x16, x20 - adc x17, x21, xzr - adds x11, x14, x10 - adcs x14, x15, x14 - adcs x15, x16, x15 - adcs x16, x17, x16 - adc x17, xzr, x17 - adds x12, x14, x10 - adcs x13, x15, x11 - adcs x14, x16, x14 - adcs x15, x17, x15 - adcs x16, xzr, x16 - adc x17, xzr, x17 - subs x22, x4, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x8 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x15, x15, x21 - eor x20, x20, x19 - adcs x16, x16, x20 - adc x17, x17, x19 - subs x22, x2, x3 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x7, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x11, x11, x21 - eor x20, x20, x19 - adcs x12, x12, x20 - adcs x13, x13, x19 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x7 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x14, x14, x21 - eor 
x20, x20, x19 - adcs x15, x15, x20 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x4 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x8, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x12, x12, x21 - eor x20, x20, x19 - adcs x13, x13, x20 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x5 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x9, x6 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x4 - cneg x22, x22, cc // cc = lo, ul, last - csetm x19, cc // cc = lo, ul, last - subs x20, x8, x7 - cneg x20, x20, cc // cc = lo, ul, last - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc // cc = lo, ul, last - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - adds x10, x10, x10 - adcs x11, x11, x11 - adcs x12, x12, x12 - adcs x13, x13, x13 - adcs x14, x14, x14 - adcs x15, x15, x15 - adcs x16, x16, x16 - adcs x17, x17, x17 - adc x19, xzr, xzr - -// Add it back to the buffer - - ldp x2, x3, [x0, #32] - adds x10, x10, x2 - adcs x11, x11, x3 - stp x10, x11, [x0, #32] - - ldp x2, x3, [x0, #48] - adcs x12, x12, x2 - adcs x13, x13, x3 - stp x12, x13, [x0, #48] - - ldp x2, x3, [x0, #64] - adcs x14, x14, x2 - adcs x15, x15, x3 - stp x14, x15, [x0, #64] - - ldp x2, x3, [x0, #80] - adcs x16, x16, x2 - adcs x17, x17, x3 - stp x16, x17, [x0, #80] - - ldp x2, x3, [x0, #96] - adcs x2, x2, x19 - adcs x3, x3, xzr - stp x2, x3, [x0, #96] - - ldp x2, x3, [x0, #112] - adcs x2, x2, xzr - adc x3, x3, xzr - stp x2, x3, [x0, #112] - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif diff --git a/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S b/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S deleted file mode 100644 index 1a5a7a0ffc4..00000000000 --- a/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S +++ /dev/null @@ -1,1303 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 -// Input x[4]; output z[4] -// -// extern void bignum_montinv_p256(uint64_t z[static 4],uint64_t x[static 4]); -// -// If the 4-digit input x is coprime to p_256, i.e. is not divisible -// by it, returns z < p_256 such that x * z == 2^512 (mod p_256). This -// is effectively "Montgomery inverse" because if we consider x and z as -// Montgomery forms of X and Z, i.e. x == 2^256 * X and z == 2^256 * Z -// (both mod p_256) then X * Z == 1 (mod p_256). That is, this function -// gives the analog of the modular inverse bignum_inv_p256 but with both -// input and output in the Montgomery domain. Note that x does not need -// to be reduced modulo p_256, but the output always is. If the input -// is divisible (i.e. is 0 or p_256), then there can be no solution to -// the congruence x * z == 2^512 (mod p_256), and z = 0 is returned. 
-// -// Standard ARM ABI: X0 = z, X1 = x -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p256) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p256) - - .text - .balign 4 - -// Size in bytes of a 64-bit word - -#define N 8 - -// Used for the return pointer - -#define res x20 - -// Loop counter and d = 2 * delta value for divstep - -#define i x21 -#define d x22 - -// Registers used for matrix element magnitudes and signs - -#define m00 x10 -#define m01 x11 -#define m10 x12 -#define m11 x13 -#define s00 x14 -#define s01 x15 -#define s10 x16 -#define s11 x17 - -// Initial carries for combinations - -#define car0 x9 -#define car1 x19 - -// Input and output, plain registers treated according to pattern - -#define reg0 x0, #0 -#define reg1 x1, #0 -#define reg2 x2, #0 -#define reg3 x3, #0 -#define reg4 x4, #0 - -#define x x1, #0 -#define z x0, #0 - -// Pointer-offset pairs for temporaries on stack - -#define f sp, #0 -#define g sp, #(6*N) -#define u sp, #(12*N) -#define v sp, #(16*N) - -// Total size to reserve on the stack - -#define NSPACE #(20*N) - -// Loading large constants - -#define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ - movk nn, n3, lsl #48 - -// --------------------------------------------------------------------------- -// Core signed almost-Montgomery reduction macro. Takes input in -// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to -// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally -// as well as t0, t1, t2. This is almost-Montgomery, i.e. the result fits -// in 4 digits but is not necessarily strictly reduced mod p_256. -// --------------------------------------------------------------------------- - -#define amontred(d4,d3,d2,d1,d0, t2,t1,t0) \ -/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ -/* unsigned Montgomery reduction, start by adding 2^61 * p_256. */ \ - mov t0, #0xe000000000000000; \ - adds d0, d0, t0; \ - sbcs d1, d1, xzr; \ - mov t1, #0x000000001fffffff; \ - adcs d2, d2, t1; \ - mov t2, #0x2000000000000000; \ - adcs d3, d3, t2; \ - mov t0, #0x1fffffffe0000000; \ - adc d4, d4, t0; \ -/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ -/* First let [t2;t1] = 2^32 * w */ \ -/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ - lsl t1, d0, #32; \ - subs t0, d0, t1; \ - lsr t2, d0, #32; \ - sbc d0, d0, t2; \ -/* Hence basic [d4;d3;d2;d1] += (2^256 - 2^224 + 2^192 + 2^96) * w */ \ - adds d1, d1, t1; \ - adcs d2, d2, t2; \ - adcs d3, d3, t0; \ - adcs d4, d4, d0; \ -/* Now capture top carry and subtract p_256 if set (almost-Montgomery) */ \ - mov t0, #0xffffffffffffffff; \ - mov t1, #0x00000000ffffffff; \ - mov t2, #0xffffffff00000001; \ - csel t0, t0, xzr, cs; \ - csel t1, t1, xzr, cs; \ - csel t2, t2, xzr, cs; \ - subs d1, d1, t0; \ - sbcs d2, d2, t1; \ - sbcs d3, d3, xzr; \ - sbc d4, d4, t2 - -// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
-// But different in register usage and returning the final matrix in -// registers as follows -// -// [ m00 m01] -// [ m10 m11] - -#define divstep59() \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, 
xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x8, x4, #0x100, lsl #12; \ - sbfx x8, x8, #21, #21; \ - mov x11, #0x100000; \ - add x11, x11, x11, lsl #21; \ - add x9, x4, x11; \ - asr x9, x9, #42; \ - add x10, x5, #0x100, lsl #12; \ - sbfx x10, x10, #21, #21; \ - add x11, x5, x11; \ - asr x11, x11, #42; \ - mul x6, x8, x2; \ - mul x7, x9, x3; \ - mul x2, x10, x2; \ - mul x3, x11, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, 
ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #21, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #42; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #21, #21; \ - add x15, x5, x15; \ - asr x15, x15, #42; \ - mul x6, x12, x2; \ - mul x7, x13, x3; \ - mul x2, x14, x2; \ - mul x3, x15, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, 
#1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x6, x14, x8; \ - mul x7, x14, x9; \ - madd x8, x13, x10, x2; \ - madd x9, x13, x11, x3; \ - madd x16, x15, x10, x6; \ - madd x17, x15, x11, x7; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #22, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #43; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #22, #21; \ - add x15, x5, x15; \ - asr x15, x15, #43; \ - mneg x2, x12, x8; \ - mneg x3, x12, x9; \ - mneg x4, x14, x8; \ - mneg x5, x14, x9; \ - msub m00, x13, x16, x2; \ - msub m01, x13, x17, x3; \ - msub m10, x15, x16, x4; \ - msub m11, x15, x17, x5 - -S2N_BN_SYMBOL(bignum_montinv_p256): - -// Save registers and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! 
- sub sp, sp, NSPACE - -// Save the return pointer for the end so we can overwrite x0 later - - mov res, x0 - -// Copy the prime and input into the main f and g variables respectively. -// Make sure x is reduced so that g <= f as assumed in the bound proof. - - mov x10, #0xffffffffffffffff - mov x11, #0x00000000ffffffff - mov x13, #0xffffffff00000001 - stp x10, x11, [f] - stp xzr, x13, [f+2*N] - str xzr, [f+4*N] - - ldp x2, x3, [x1] - subs x10, x2, x10 - sbcs x11, x3, x11 - ldp x4, x5, [x1, #(2*N)] - sbcs x12, x4, xzr - sbcs x13, x5, x13 - - csel x2, x2, x10, cc - csel x3, x3, x11, cc - csel x4, x4, x12, cc - csel x5, x5, x13, cc - - stp x2, x3, [g] - stp x4, x5, [g+2*N] - str xzr, [g+4*N] - -// Also maintain reduced < 2^256 vector [u,v] such that -// [f,g] == x * 2^{5*i-562} * [u,v] (mod p_256) -// starting with [p_256,x] == x * 2^{5*0-562} * [0,2^562] (mod p_256) -// The weird-looking 5*i modifications come in because we are doing -// 64-bit word-sized Montgomery reductions at each stage, which is -// 5 bits more than the 59-bit requirement to keep things stable. -// After the 10th and last iteration and sign adjustment, when -// f == 1 for in-scope cases, we have x * 2^{50-562} * u == 1, i.e. -// x * u == 2^512 as required. - - stp xzr, xzr, [u] - stp xzr, xzr, [u+2*N] - -// The starting constant 2^562 mod p_256 is -// 0x000bffffffebffff:fffbffffffefffff:ffe8000000000000:000c000000140000 -// where colons separate 64-bit subwords, least significant at the right. -// Only word number 1, value 0xffe8000000000000, is a single ARM move. - - mov x10, #0x0000000000140000 - orr x10, x10, #0x000c000000000000 - - mov x11, #0xffe8000000000000 - - movbig(x13, #0x000b, #0xffff, #0xffef, #0xffff) - orr x12, x13, #0xfff0000000000000 - and x13, x13, #0xfffffffffffbffff - - stp x10, x11, [v] - stp x12, x13, [v+2*N] - -// Start of main loop. We jump into the middle so that the divstep -// portion is common to the special tenth iteration after a uniform -// first 9. - - mov i, #10 - mov d, #1 - b bignum_montinv_p256_midloop - -bignum_montinv_p256_loop: - -// Separate the matrix elements into sign-magnitude pairs - - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - -// Adjust the initial values to allow for complement instead of negation -// This initial offset is the same for [f,g] and [u,v] compositions. -// Save it in stable registers for the [u,v] part and do [f,g] first. - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - - and x0, m10, s10 - and x1, m11, s11 - add car1, x0, x1 - -// Now the computation of the updated f and g values. This maintains a -// 2-word carry between stages so we can conveniently insert the shift -// right by 59 before storing back, and not overwrite digits we need -// again of the old f and g values. 
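The per-digit accumulation below leans on a complement-instead-of-negation identity, which is exactly what the car0/car1 offsets above prepare: for a signed matrix entry m with all-ones sign mask s and magnitude |m|, m*x == |m|*(x ^ s) + (|m| & s) (mod 2^64), so each signed multiply-accumulate becomes an unsigned eor/mul/umulh chain plus a one-off per-row offset. A tiny standalone check of the single-word identity (illustration only):

#include <assert.h>
#include <stdint.h>

int main(void) {
  int64_t m = -0x1234567890ABCDEFLL;        /* a signed matrix entry             */
  uint64_t x = 0xDEADBEEFCAFEF00DULL;       /* one digit of f or g               */
  uint64_t s = (m < 0) ? ~0ULL : 0;         /* csetm: all-ones iff m < 0         */
  uint64_t mag = ((uint64_t)m ^ s) - s;     /* cneg: |m|                         */
  uint64_t lhs = (uint64_t)m * x;           /* signed product, reduced mod 2^64  */
  uint64_t rhs = mag * (x ^ s) + (mag & s); /* eor + mul + the car-style offset  */
  assert(lhs == rhs);
  return 0;
}

In the digit loops the identity is applied word by word, with umulh supplying the high halves and the two (|m| & s) offsets folded in once as car0 and car1.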
-// -// Digit 0 of [f,g] - - ldr x7, [f] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [g] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x3, x3, x1 - -// Digit 1 of [f,g] - - ldr x7, [f+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [g+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [f] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [g] - -// Digit 2 of [f,g] - - ldr x7, [f+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [g+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [f+N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [g+N] - -// Digits 3 and 4 of [f,g] - - ldr x7, [f+3*N] - eor x1, x7, s00 - ldr x23, [f+4*N] - eor x3, x23, s00 - and x3, x3, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [g+3*N] - eor x1, x8, s01 - ldr x24, [g+4*N] - eor x0, x24, s01 - and x0, x0, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [f+2*N] - extr x5, x3, x5, #59 - str x5, [f+3*N] - asr x3, x3, #59 - str x3, [f+4*N] - - eor x1, x7, s10 - eor x5, x23, s10 - and x5, x5, m10 - neg x5, x5 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x5, x5, x1 - eor x1, x8, s11 - eor x0, x24, s11 - and x0, x0, m11 - sub x5, x5, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x5, x5, x1 - extr x4, x2, x4, #59 - str x4, [g+2*N] - extr x2, x5, x2, #59 - str x2, [g+3*N] - asr x5, x5, #59 - str x5, [g+4*N] - -// Now the computation of the updated u and v values and their -// Montgomery reductions. A very similar accumulation except that -// the top words of u and v are unsigned and we don't shift. 
-// -// Digit 0 of [u,v] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v] - adc x3, x3, x1 - -// Digit 1 of [u,v] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - str x3, [v+N] - adc x4, x4, x1 - -// Digit 2 of [u,v] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - str x4, [v+2*N] - adc x2, x2, x1 - -// Digits 3 and 4 of u (top is unsigned) - - ldr x7, [u+3*N] - eor x1, x7, s00 - and x3, s00, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - -// Montgomery reduction of u - - ldp x0, x1, [u] - ldr x6, [u+2*N] - amontred(x3,x5,x6,x1,x0, x10,x11,x14) - stp x1, x6, [u] - stp x5, x3, [u+16] - -// Digits 3 and 4 of v (top is unsigned) - - eor x1, x7, s10 - and x5, s10, m10 - neg x5, x5 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x5, x5, x1 - eor x1, x8, s11 - and x0, s11, m11 - sub x5, x5, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x5, x5, x1 - -// Montgomery reduction of v - - ldp x0, x1, [v] - ldr x3, [v+2*N] - amontred(x5,x2,x3,x1,x0, x10,x11,x14) - stp x1, x3, [v] - stp x2, x5, [v+16] - -bignum_montinv_p256_midloop: - - mov x1, d - ldr x2, [f] - ldr x3, [g] - divstep59() - mov d, x1 - -// Next iteration - - subs i, i, #1 - bne bignum_montinv_p256_loop - -// The 10th and last iteration does not need anything except the -// u value and the sign of f; the latter can be obtained from the -// lowest word of f. So it's done differently from the main loop. -// Find the sign of the new f. For this we just need one digit -// since we know (for in-scope cases) that f is either +1 or -1. -// We don't explicitly shift right by 59 either, but looking at -// bit 63 (or any bit >= 60) of the unshifted result is enough -// to distinguish -1 from +1; this is then made into a mask. - - ldr x0, [f] - ldr x1, [g] - mul x0, x0, m00 - madd x1, x1, m01, x0 - asr x0, x1, #63 - -// Now separate out the matrix into sign-magnitude pairs -// and adjust each one based on the sign of f. -// -// Note that at this point we expect |f|=1 and we got its -// sign above, so then since [f,0] == x * 2^{-512} [u,v] (mod p_256) -// we want to flip the sign of u according to that of f. 
- - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - eor s00, s00, x0 - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - eor s01, s01, x0 - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - eor s10, s10, x0 - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - eor s11, s11, x0 - -// Adjust the initial value to allow for complement instead of negation - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - -// Digit 0 of [u] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - -// Digit 1 of [u] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - -// Digit 2 of [u] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - -// Digits 3 and 4 of u (top is unsigned) - - ldr x7, [u+3*N] - eor x1, x7, s00 - and x3, s00, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - -// Montgomery reduction of u. This needs to be strict not "almost" -// so it is followed by an optional subtraction of p_256 - - ldp x0, x1, [u] - ldr x2, [u+2*N] - amontred(x3,x5,x2,x1,x0, x10,x11,x14) - - mov x10, #0xffffffffffffffff - subs x10, x1, x10 - mov x11, #0x00000000ffffffff - sbcs x11, x2, x11 - mov x13, #0xffffffff00000001 - sbcs x12, x5, xzr - sbcs x13, x3, x13 - - csel x10, x1, x10, cc - csel x11, x2, x11, cc - csel x12, x5, x12, cc - csel x13, x3, x13, cc - -// Store it back to the final output - - stp x10, x11, [res] - stp x12, x13, [res, #16] - -// Restore stack and registers - - add sp, sp, NSPACE - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/bignum_inv_p384.S b/third_party/s2n-bignum/arm/p384/bignum_inv_p384.S deleted file mode 100644 index 085224172ea..00000000000 --- a/third_party/s2n-bignum/arm/p384/bignum_inv_p384.S +++ /dev/null @@ -1,1469 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 -// Input x[6]; output z[6] -// -// extern void bignum_inv_p384(uint64_t z[static 6],uint64_t x[static 6]); -// -// If the 6-digit input x is coprime to p_384, i.e. is not divisible -// by it, returns z < p_384 such that x * z == 1 (mod p_384). Note that -// x does not need to be reduced modulo p_384, but the output always is. -// If the input is divisible (i.e. is 0 or p_384), then there can be no -// modular inverse and z = 0 is returned. 
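
The rest of the file is driven by an unrolled divstep iteration (the loop counter i, the d = 2 * delta value, and the divstep59() macro defined further down), just like the p256 routine above and the Montgomery p384 variant below. A plain C sketch of the single-step recurrence underlying it (a textbook Bernstein-Yang-style divstep, restated here only for orientation; the deleted code batches 59 steps on 20-bit truncated values and also accumulates the 2x2 transition matrix m00..m11, which this sketch omits):

    #include <assert.h>
    #include <stdint.h>

    /* One divstep; requires f odd on entry and keeps it odd. */
    static void divstep(int64_t *delta, int64_t *f, int64_t *g) {
      if (*delta > 0 && (*g & 1)) {
        int64_t newg = (*g - *f) / 2;   /* f, g both odd, so this is exact */
        *delta = 1 - *delta;
        *f = *g;
        *g = newg;
      } else {
        *delta = 1 + *delta;
        *g = (*g + ((*g & 1) ? *f : 0)) / 2;   /* also an exact halving */
      }
    }

    static int64_t gcd64(int64_t a, int64_t b) {
      while (b) { int64_t t = a % b; a = b; b = t; }
      return a < 0 ? -a : a;
    }

    int main(void) {
      /* Iterating divstep from (1, f0, g0) with f0 odd drives g to 0 and
         f to +/- gcd(f0, g0); the inverse routines additionally track how
         the iterations transform the inputs. */
      int64_t delta = 1, f = 1000003, g = 123456;   /* arbitrary small stand-ins */
      for (int i = 0; i < 200; i++) divstep(&delta, &f, &g);
      assert(g == 0);
      assert((f < 0 ? -f : f) == gcd64(1000003, 123456));
      return 0;
    }
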
-// -// Standard ARM ABI: X0 = z, X1 = x -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p384) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p384) - - .text - .balign 4 - -// Size in bytes of a 64-bit word - -#define N 8 - -// Used for the return pointer - -#define res x20 - -// Loop counter and d = 2 * delta value for divstep - -#define i x21 -#define d x22 - -// Registers used for matrix element magnitudes and signs - -#define m00 x10 -#define m01 x11 -#define m10 x12 -#define m11 x13 -#define s00 x14 -#define s01 x15 -#define s10 x16 -#define s11 x17 - -// Initial carries for combinations - -#define car0 x9 -#define car1 x19 - -// Input and output, plain registers treated according to pattern - -#define reg0 x0, #0 -#define reg1 x1, #0 -#define reg2 x2, #0 -#define reg3 x3, #0 -#define reg4 x4, #0 - -#define x x1, #0 -#define z x0, #0 - -// Pointer-offset pairs for temporaries on stack -// The u and v variables are 6 words each as expected, but the f and g -// variables are 8 words each -- they need to have at least one extra -// word for a sign word, and to preserve alignment we "round up" to 8. -// In fact, we currently keep an extra word in u and v as well. - -#define f sp, #0 -#define g sp, #(8*N) -#define u sp, #(16*N) -#define v sp, #(24*N) - -// Total size to reserve on the stack - -#define NSPACE #(32*N) - -// --------------------------------------------------------------------------- -// Core signed almost-Montgomery reduction macro. Takes input in -// [d6;d5;d4;d3;d2;d1;d0] and returns result in [d6;d5d4;d3;d2;d1], adding -// to the existing [d6;d5;d4;d3;d2;d1], and re-using d0 as a temporary -// internally as well as t0, t1, t2. This is almost-Montgomery, i.e. the -// result fits in 6 digits but is not necessarily strictly reduced mod p_384. -// --------------------------------------------------------------------------- - -#define amontred(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ -/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ -/* unsigned Montgomery reduction, start by adding 2^61 * p_384. */ \ - mov t1, #0xe000000000000000; \ - adds d0, d0, t1; \ - mov t2, #0x000000001fffffff; \ - adcs d1, d1, t2; \ - mov t3, #0xffffffffe0000000; \ - bic t3, t3, #0x2000000000000000; \ - adcs d2, d2, t3; \ - sbcs d3, d3, xzr; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ - mov t1, #0x1fffffffffffffff; \ - adc d6, d6, t1; \ -/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ -/* Store it back into d0 since we no longer need that digit. 
*/ \ - add d0, d0, d0, lsl #32; \ -/* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ -/* We know the lowest word will cancel d0 so we don't need it */ \ - mov t1, #0xffffffff00000001; \ - umulh t1, t1, d0; \ - mov t2, #0x00000000ffffffff; \ - mul t3, t2, d0; \ - umulh t2, t2, d0; \ - adds t1, t1, t3; \ - adcs t2, t2, d0; \ - cset t3, cs; \ -/* Now x + p_384 * w = (x + 2^384 * w) - (2^384 - p_384) * w */ \ -/* We catch the net top carry from add-subtract in the digit d0 */ \ - adds d6, d6, d0; \ - cset d0, cs; \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ - sbcs d6, d6, xzr; \ - sbcs d0, d0, xzr; \ -/* Now if d0 is nonzero we subtract p_384 (almost-Montgomery) */ \ - neg d0, d0; \ - and t1, d0, #0x00000000ffffffff; \ - and t2, d0, #0xffffffff00000000; \ - and t3, d0, #0xfffffffffffffffe; \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, d0; \ - sbcs d5, d5, d0; \ - sbc d6, d6, d0 - -// Very similar to a subroutine call to the s2n-bignum word_divstep59. -// But different in register usage and returning the final matrix in -// registers as follows -// -// [ m00 m01] -// [ m10 m11] - -#define divstep59() \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr 
x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x8, x4, #0x100, lsl #12; \ - sbfx x8, x8, #21, #21; \ - mov x11, #0x100000; \ - add x11, x11, x11, lsl #21; \ - add x9, x4, x11; \ - asr x9, x9, #42; \ - add x10, x5, #0x100, lsl #12; \ - sbfx x10, x10, #21, #21; \ - add x11, x5, x11; \ - asr x11, x11, #42; \ - mul x6, x8, x2; \ - mul x7, x9, x3; \ - mul x2, x10, x2; \ - mul x3, x11, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst 
x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #21, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #42; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #21, #21; \ - add x15, x5, x15; \ - asr x15, x15, #42; \ - mul x6, x12, x2; \ - mul x7, x13, x3; \ - mul x2, x14, x2; \ - mul x3, x15, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - 
add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x6, x14, x8; \ - mul x7, x14, x9; \ - madd x8, x13, x10, x2; \ - madd x9, x13, x11, x3; \ - madd x16, x15, x10, x6; \ - madd x17, x15, x11, x7; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, 
x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #22, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #43; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #22, #21; \ - add x15, x5, x15; \ - asr x15, x15, #43; \ - mneg x2, x12, x8; \ - mneg x3, x12, x9; \ - mneg x4, x14, x8; \ - mneg x5, x14, x9; \ - msub m00, x13, x16, x2; \ - msub m01, x13, x17, x3; \ - msub m10, x15, x16, x4; \ - msub m11, x15, x17, x5 - -S2N_BN_SYMBOL(bignum_inv_p384): - -// Save registers and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! - sub sp, sp, NSPACE - -// Save the return pointer for the end so we can overwrite x0 later - - mov res, x0 - -// Copy the prime and input into the main f and g variables respectively. -// Make sure x is reduced so that g <= f as assumed in the bound proof. - - mov x10, #0x00000000ffffffff - mov x11, #0xffffffff00000000 - mov x12, #0xfffffffffffffffe - mov x15, #0xffffffffffffffff - stp x10, x11, [f] - stp x12, x15, [f+2*N] - stp x15, x15, [f+4*N] - str xzr, [f+6*N] - - ldp x2, x3, [x1] - subs x10, x2, x10 - sbcs x11, x3, x11 - ldp x4, x5, [x1, #(2*N)] - sbcs x12, x4, x12 - sbcs x13, x5, x15 - ldp x6, x7, [x1, #(4*N)] - sbcs x14, x6, x15 - sbcs x15, x7, x15 - - csel x2, x2, x10, cc - csel x3, x3, x11, cc - csel x4, x4, x12, cc - csel x5, x5, x13, cc - csel x6, x6, x14, cc - csel x7, x7, x15, cc - - stp x2, x3, [g] - stp x4, x5, [g+2*N] - stp x6, x7, [g+4*N] - str xzr, [g+6*N] - -// Also maintain reduced < 2^384 vector [u,v] such that -// [f,g] == x * 2^{5*i-75} * [u,v] (mod p_384) -// starting with [p_384,x] == x * 2^{5*0-75} * [0,2^75] (mod p_384) -// The weird-looking 5*i modifications come in because we are doing -// 64-bit word-sized Montgomery reductions at each stage, which is -// 5 bits more than the 59-bit requirement to keep things stable. - - stp xzr, xzr, [u] - stp xzr, xzr, [u+2*N] - stp xzr, xzr, [u+4*N] - - mov x10, #2048 - stp xzr, x10, [v] - stp xzr, xzr, [v+2*N] - stp xzr, xzr, [v+4*N] - -// Start of main loop. We jump into the middle so that the divstep -// portion is common to the special fifteenth iteration after a uniform -// first 14. - - mov i, #15 - mov d, #1 - b midloop - -loop: - -// Separate the matrix elements into sign-magnitude pairs - - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - -// Adjust the initial values to allow for complement instead of negation -// This initial offset is the same for [f,g] and [u,v] compositions. -// Save it in stable registers for the [u,v] part and do [f,g] first. - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - - and x0, m10, s10 - and x1, m11, s11 - add car1, x0, x1 - -// Now the computation of the updated f and g values. This maintains a -// 2-word carry between stages so we can conveniently insert the shift -// right by 59 before storing back, and not overwrite digits we need -// again of the old f and g values. 
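
Stepping back to the "complement instead of negation" adjustment above: it relies on the identity m*x == (x XOR s)*|m| + (|m| AND s) modulo 2^64, where s is the all-ones mask for negative m, so the correction terms for a row's two products can be folded into the single initial offsets car0 and car1. A single-word C check of that identity (a sketch, not taken from the deleted file):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* Signed stand-ins for one row of the matrix (m00, m01) and two
         arbitrary input words (one digit of f and g). */
      int64_t m00 = -0x123456789abLL, m01 = 0x6543210fedLL;
      uint64_t x = 0xdeadbeefcafef00dULL, y = 0x0123456789abcdefULL;

      /* csetm / cneg: sign masks and magnitudes. */
      uint64_t s00 = m00 < 0 ? ~0ULL : 0, s01 = m01 < 0 ? ~0ULL : 0;
      uint64_t a00 = m00 < 0 ? (uint64_t)-m00 : (uint64_t)m00;
      uint64_t a01 = m01 < 0 ? (uint64_t)-m01 : (uint64_t)m01;

      /* The single initial offset, as in "add car0, x0, x1" above. */
      uint64_t car0 = (a00 & s00) + (a01 & s01);

      /* Modulo 2^64 (the lowest digit of the multi-word computation),
         m00*x + m01*y == (x^s00)*|m00| + (y^s01)*|m01| + car0. */
      uint64_t lhs = (uint64_t)m00 * x + (uint64_t)m01 * y;
      uint64_t rhs = (x ^ s00) * a00 + (y ^ s01) * a01 + car0;
      printf("%s\n", lhs == rhs ? "match" : "mismatch");
      return 0;
    }
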
-// -// Digit 0 of [f,g] - - ldr x7, [f] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [g] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x3, x3, x1 - -// Digit 1 of [f,g] - - ldr x7, [f+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [g+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [f] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [g] - -// Digit 2 of [f,g] - - ldr x7, [f+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [g+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [f+N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [g+N] - -// Digit 3 of [f,g] - - ldr x7, [f+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [g+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [f+2*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [g+2*N] - -// Digit 4 of [f,g] - - ldr x7, [f+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [g+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [f+3*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [g+3*N] - -// Digits 5 and 6 of [f,g] - - ldr x7, [f+5*N] - eor x1, x7, s00 - ldr x23, [f+6*N] - eor x2, x23, s00 - and x2, x2, m00 - neg x2, x2 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [g+5*N] - eor x1, x8, s01 - ldr x24, [g+6*N] - eor x0, x24, s01 - and x0, x0, m01 - sub x2, x2, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [f+4*N] - extr x4, x2, x4, #59 - str x4, [f+5*N] - asr x2, x2, #59 - str x2, [f+6*N] - - eor x1, x7, s10 - eor x4, x23, s10 - and x4, x4, m10 - neg x4, x4 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x4, x4, x1 - eor x1, x8, s11 - eor x0, x24, s11 - and x0, x0, m11 - sub x4, x4, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x4, x4, x1 - extr x6, x5, x6, #59 - str x6, [g+4*N] - extr x5, x4, x5, #59 - str x5, [g+5*N] - asr x4, x4, #59 - str x4, [g+6*N] - -// Now the computation of the updated u and v values and their -// Montgomery reductions. 
A very similar accumulation except that -// the top words of u and v are unsigned and we don't shift. -// -// Digit 0 of [u,v] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v] - adc x3, x3, x1 - -// Digit 1 of [u,v] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - str x3, [v+N] - adc x4, x4, x1 - -// Digit 2 of [u,v] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - str x4, [v+2*N] - adc x2, x2, x1 - -// Digit 3 of [u,v] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - str x2, [v+3*N] - adc x6, x6, x1 - -// Digit 4 of [u,v] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - str x6, [v+4*N] - adc x5, x5, x1 - -// Digits 5 and 6 of [u,v] (top is unsigned) - - ldr x7, [u+5*N] - eor x1, x7, s00 - and x2, s00, m00 - neg x2, x2 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x2, x2, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - str x2, [u+6*N] - - eor x1, x7, s10 - and x4, s10, m10 - neg x4, x4 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x4, x4, x1 - eor x1, x8, s11 - and x0, s11, m11 - sub x4, x4, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v+5*N] - adc x4, x4, x1 - str x4, [v+6*N] - -// Montgomery reduction of u - - ldp x0, x1, [u] - ldp x2, x3, [u+16] - ldp x4, x5, [u+32] - ldr x6, [u+48] - amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) - stp x1, x2, [u] - stp x3, x4, [u+16] - stp x5, x6, [u+32] - -// Montgomery reduction of v - - ldp x0, x1, [v] - ldp x2, x3, [v+16] - ldp x4, x5, [v+32] - ldr x6, [v+48] - amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) - stp x1, x2, [v] - stp x3, x4, [v+16] - stp x5, x6, [v+32] - -midloop: - - mov x1, d - ldr x2, [f] - ldr x3, [g] - 
divstep59() - mov d, x1 - -// Next iteration - - subs i, i, #1 - bne loop - -// The 15th and last iteration does not need anything except the -// u value and the sign of f; the latter can be obtained from the -// lowest word of f. So it's done differently from the main loop. -// Find the sign of the new f. For this we just need one digit -// since we know (for in-scope cases) that f is either +1 or -1. -// We don't explicitly shift right by 59 either, but looking at -// bit 63 (or any bit >= 60) of the unshifted result is enough -// to distinguish -1 from +1; this is then made into a mask. - - ldr x0, [f] - ldr x1, [g] - mul x0, x0, m00 - madd x1, x1, m01, x0 - asr x0, x1, #63 - -// Now separate out the matrix into sign-magnitude pairs -// and adjust each one based on the sign of f. -// -// Note that at this point we expect |f|=1 and we got its -// sign above, so then since [f,0] == x * [u,v] (mod p_384) -// we want to flip the sign of u according to that of f. - - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - eor s00, s00, x0 - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - eor s01, s01, x0 - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - eor s10, s10, x0 - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - eor s11, s11, x0 - -// Adjust the initial value to allow for complement instead of negation - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - -// Digit 0 of [u] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - -// Digit 1 of [u] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - -// Digit 2 of [u] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - -// Digit 3 of [u] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - -// Digit 4 of [u] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - -// Digits 5 and 6 of [u] (top is unsigned) - - ldr x7, [u+5*N] - eor x1, x7, s00 - and x2, s00, m00 - neg x2, x2 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x2, x2, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - str x2, [u+6*N] - -// Montgomery reduction of u. 
This needs to be strict not "almost" -// so it is followed by an optional subtraction of p_384 - - ldp x10, x0, [u] - ldp x1, x2, [u+16] - ldp x3, x4, [u+32] - ldr x5, [u+48] - amontred(x5,x4,x3,x2,x1,x0,x10, x9,x8,x7) - - mov x10, #0x00000000ffffffff - subs x10, x0, x10 - mov x11, #0xffffffff00000000 - sbcs x11, x1, x11 - mov x12, #0xfffffffffffffffe - sbcs x12, x2, x12 - mov x15, #0xffffffffffffffff - sbcs x13, x3, x15 - sbcs x14, x4, x15 - sbcs x15, x5, x15 - - csel x0, x0, x10, cc - csel x1, x1, x11, cc - csel x2, x2, x12, cc - csel x3, x3, x13, cc - csel x4, x4, x14, cc - csel x5, x5, x15, cc - -// Store it back to the final output - - stp x0, x1, [res] - stp x2, x3, [res, #16] - stp x4, x5, [res, #32] - -// Restore stack and registers - - add sp, sp, NSPACE - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/bignum_montinv_p384.S b/third_party/s2n-bignum/arm/p384/bignum_montinv_p384.S deleted file mode 100644 index 79d59781196..00000000000 --- a/third_party/s2n-bignum/arm/p384/bignum_montinv_p384.S +++ /dev/null @@ -1,1487 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 -// Input x[6]; output z[6] -// -// extern void bignum_montinv_p384(uint64_t z[static 6],uint64_t x[static 6]); -// -// If the 6-digit input x is coprime to p_384, i.e. is not divisible -// by it, returns z < p_384 such that x * z == 2^768 (mod p_384). This -// is effectively "Montgomery inverse" because if we consider x and z as -// Montgomery forms of X and Z, i.e. x == 2^384 * X and z == 2^384 * Z -// (both mod p_384) then X * Z == 1 (mod p_384). That is, this function -// gives the analog of the modular inverse bignum_inv_p384 but with both -// input and output in the Montgomery domain. Note that x does not need -// to be reduced modulo p_384, but the output always is. If the input -// is divisible (i.e. is 0 or p_384), then there can be no solution to -// the congruence x * z == 2^768 (mod p_384), and z = 0 is returned. 
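
The Montgomery-inverse relation stated above (x * z == 2^768, i.e. X * Z == 1 for the underlying X = x/2^384 and Z = z/2^384) can be sanity-checked at toy scale. A C sketch under stand-in assumptions (a 64-bit prime p and R = 2^64 playing the roles of p_384 and 2^384; this is not the s2n-bignum API):

    #include <stdint.h>
    #include <stdio.h>

    typedef unsigned __int128 u128;   /* GCC/Clang extension */

    static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t p) {
      return (uint64_t)(((u128)a * b) % p);
    }

    static uint64_t powmod(uint64_t a, uint64_t e, uint64_t p) {
      uint64_t r = 1;
      while (e) {
        if (e & 1) r = mulmod(r, a, p);
        a = mulmod(a, a, p);
        e >>= 1;
      }
      return r;
    }

    int main(void) {
      const uint64_t p = 0xffffffff00000001ULL;            /* a 64-bit prime */
      const uint64_t R = (uint64_t)(((u128)1 << 64) % p);  /* 2^64 mod p */
      uint64_t x = 0x123456789abcdef1ULL % p;              /* "Montgomery form" input */

      /* The analogue of the contract: z with x * z == R^2 (mod p). */
      uint64_t z = mulmod(mulmod(R, R, p), powmod(x, p - 2, p), p);
      printf("x*z == R^2:  %d\n", mulmod(x, z, p) == mulmod(R, R, p));

      /* Equivalently, for X = x/R and Z = z/R we get X * Z == 1 (mod p). */
      uint64_t Rinv = powmod(R, p - 2, p);
      uint64_t X = mulmod(x, Rinv, p), Z = mulmod(z, Rinv, p);
      printf("X*Z == 1:    %d\n", mulmod(X, Z, p) == 1);
      return 0;
    }
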
-// -// Standard ARM ABI: X0 = z, X1 = x -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p384) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p384) - - .text - .balign 4 - -// Size in bytes of a 64-bit word - -#define N 8 - -// Used for the return pointer - -#define res x20 - -// Loop counter and d = 2 * delta value for divstep - -#define i x21 -#define d x22 - -// Registers used for matrix element magnitudes and signs - -#define m00 x10 -#define m01 x11 -#define m10 x12 -#define m11 x13 -#define s00 x14 -#define s01 x15 -#define s10 x16 -#define s11 x17 - -// Initial carries for combinations - -#define car0 x9 -#define car1 x19 - -// Input and output, plain registers treated according to pattern - -#define reg0 x0, #0 -#define reg1 x1, #0 -#define reg2 x2, #0 -#define reg3 x3, #0 -#define reg4 x4, #0 - -#define x x1, #0 -#define z x0, #0 - -// Pointer-offset pairs for temporaries on stack -// The u and v variables are 6 words each as expected, but the f and g -// variables are 8 words each -- they need to have at least one extra -// word for a sign word, and to preserve alignment we "round up" to 8. -// In fact, we currently keep an extra word in u and v as well. - -#define f sp, #0 -#define g sp, #(8*N) -#define u sp, #(16*N) -#define v sp, #(24*N) - -// Total size to reserve on the stack - -#define NSPACE #(32*N) - -// --------------------------------------------------------------------------- -// Core signed almost-Montgomery reduction macro. Takes input in -// [d6;d5;d4;d3;d2;d1;d0] and returns result in [d6;d5d4;d3;d2;d1], adding -// to the existing [d6;d5;d4;d3;d2;d1], and re-using d0 as a temporary -// internally as well as t0, t1, t2. This is almost-Montgomery, i.e. the -// result fits in 6 digits but is not necessarily strictly reduced mod p_384. -// --------------------------------------------------------------------------- - -#define amontred(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ -/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ -/* unsigned Montgomery reduction, start by adding 2^61 * p_384. */ \ - mov t1, #0xe000000000000000; \ - adds d0, d0, t1; \ - mov t2, #0x000000001fffffff; \ - adcs d1, d1, t2; \ - mov t3, #0xffffffffe0000000; \ - bic t3, t3, #0x2000000000000000; \ - adcs d2, d2, t3; \ - sbcs d3, d3, xzr; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ - mov t1, #0x1fffffffffffffff; \ - adc d6, d6, t1; \ -/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ -/* Store it back into d0 since we no longer need that digit. 
*/ \ - add d0, d0, d0, lsl #32; \ -/* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ -/* We know the lowest word will cancel d0 so we don't need it */ \ - mov t1, #0xffffffff00000001; \ - umulh t1, t1, d0; \ - mov t2, #0x00000000ffffffff; \ - mul t3, t2, d0; \ - umulh t2, t2, d0; \ - adds t1, t1, t3; \ - adcs t2, t2, d0; \ - cset t3, cs; \ -/* Now x + p_384 * w = (x + 2^384 * w) - (2^384 - p_384) * w */ \ -/* We catch the net top carry from add-subtract in the digit d0 */ \ - adds d6, d6, d0; \ - cset d0, cs; \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ - sbcs d6, d6, xzr; \ - sbcs d0, d0, xzr; \ -/* Now if d0 is nonzero we subtract p_384 (almost-Montgomery) */ \ - neg d0, d0; \ - and t1, d0, #0x00000000ffffffff; \ - and t2, d0, #0xffffffff00000000; \ - and t3, d0, #0xfffffffffffffffe; \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, d0; \ - sbcs d5, d5, d0; \ - sbc d6, d6, d0 - -// Very similar to a subroutine call to the s2n-bignum word_divstep59. -// But different in register usage and returning the final matrix in -// registers as follows -// -// [ m00 m01] -// [ m10 m11] - -#define divstep59() \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr 
x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x8, x4, #0x100, lsl #12; \ - sbfx x8, x8, #21, #21; \ - mov x11, #0x100000; \ - add x11, x11, x11, lsl #21; \ - add x9, x4, x11; \ - asr x9, x9, #42; \ - add x10, x5, #0x100, lsl #12; \ - sbfx x10, x10, #21, #21; \ - add x11, x5, x11; \ - asr x11, x11, #42; \ - mul x6, x8, x2; \ - mul x7, x9, x3; \ - mul x2, x10, x2; \ - mul x3, x11, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst 
x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #21, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #42; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #21, #21; \ - add x15, x5, x15; \ - asr x15, x15, #42; \ - mul x6, x12, x2; \ - mul x7, x13, x3; \ - mul x2, x14, x2; \ - mul x3, x15, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - 
add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x6, x14, x8; \ - mul x7, x14, x9; \ - madd x8, x13, x10, x2; \ - madd x9, x13, x11, x3; \ - madd x16, x15, x10, x6; \ - madd x17, x15, x11, x7; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, 
x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #22, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #43; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #22, #21; \ - add x15, x5, x15; \ - asr x15, x15, #43; \ - mneg x2, x12, x8; \ - mneg x3, x12, x9; \ - mneg x4, x14, x8; \ - mneg x5, x14, x9; \ - msub m00, x13, x16, x2; \ - msub m01, x13, x17, x3; \ - msub m10, x15, x16, x4; \ - msub m11, x15, x17, x5 - -S2N_BN_SYMBOL(bignum_montinv_p384): - -// Save registers and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! - sub sp, sp, NSPACE - -// Save the return pointer for the end so we can overwrite x0 later - - mov res, x0 - -// Copy the prime and input into the main f and g variables respectively. -// Make sure x is reduced so that g <= f as assumed in the bound proof. - - mov x10, #0x00000000ffffffff - mov x11, #0xffffffff00000000 - mov x12, #0xfffffffffffffffe - mov x15, #0xffffffffffffffff - stp x10, x11, [f] - stp x12, x15, [f+2*N] - stp x15, x15, [f+4*N] - str xzr, [f+6*N] - - ldp x2, x3, [x1] - subs x10, x2, x10 - sbcs x11, x3, x11 - ldp x4, x5, [x1, #(2*N)] - sbcs x12, x4, x12 - sbcs x13, x5, x15 - ldp x6, x7, [x1, #(4*N)] - sbcs x14, x6, x15 - sbcs x15, x7, x15 - - csel x2, x2, x10, cc - csel x3, x3, x11, cc - csel x4, x4, x12, cc - csel x5, x5, x13, cc - csel x6, x6, x14, cc - csel x7, x7, x15, cc - - stp x2, x3, [g] - stp x4, x5, [g+2*N] - stp x6, x7, [g+4*N] - str xzr, [g+6*N] - -// Also maintain reduced < 2^384 vector [u,v] such that -// [f,g] == x * 2^{5*i-843} * [u,v] (mod p_384) -// starting with [p_384,x] == x * 2^{5*0-843} * [0,2^843] (mod p_384) -// The weird-looking 5*i modifications come in because we are doing -// 64-bit word-sized Montgomery reductions at each stage, which is -// 5 bits more than the 59-bit requirement to keep things stable. -// After the 15th and last iteration and sign adjustment, when -// f == 1 for in-scope cases, we have x * 2^{75-843} * u == 1, i.e. -// x * u == 2^768 as required. - - stp xzr, xzr, [u] - stp xzr, xzr, [u+2*N] - stp xzr, xzr, [u+4*N] - -// The starting constant 2^843 mod p_384 is -// 0x0000000000000800:00001000000007ff:fffff00000000000 -// :00001000000007ff:fffff00000000800:0000000000000000 -// where colons separate 64-bit subwords, least significant at the right. -// Not all of these are single loads on ARM so this is a bit dynamic - - mov x12, #0xfffff00000000000 - orr x10, x12, #0x0000000000000800 - stp xzr, x10, [v] - mov x11, #0x00000000000007ff - orr x11, x11, #0x0000100000000000 - stp x11, x12, [v+2*N] - mov x12, #0x0000000000000800 - stp x11, x12, [v+4*N] - -// Start of main loop. We jump into the middle so that the divstep -// portion is common to the special fifteenth iteration after a uniform -// first 14. 
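
[Editor's note, not part of the patch: the exponent bookkeeping in the comment above can be checked independently. The standalone Python sketch below takes p_384 and the quoted limb values from the comments and code above, and verifies that the starting constant really is 2^843 mod p_384 and that, after the 15 iterations, the exponent 5*15 - 843 = -768 makes u the inverse still in Montgomery form.]

    p384 = 2**384 - 2**128 - 2**96 + 2**32 - 1
    start = int("0000000000000800" "00001000000007ff" "fffff00000000000"
                "00001000000007ff" "fffff00000000800" "0000000000000000", 16)
    assert pow(2, 843, p384) == start        # the constant quoted in the comment
    assert 5 * 15 - 843 == -768              # exponent after the 15th iteration
    # x*u == 2^768 (mod p_384) means: for a Montgomery-encoded input 2^384*a,
    # the result u equals 2^384 * a^{-1}, i.e. the inverse stays Montgomery-encoded.
    a = 0x1234                               # arbitrary nonzero example value
    x_in = (2**384 * a) % p384
    u = (2**768 * pow(x_in, -1, p384)) % p384
    assert u == (2**384 * pow(a, -1, p384)) % p384

[So the routine maps a Montgomery-domain input directly to a Montgomery-domain inverse, with no extra domain conversion needed.]
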
- - mov i, #15 - mov d, #1 - b bignum_montinv_p384_midloop - -bignum_montinv_p384_loop: - -// Separate the matrix elements into sign-magnitude pairs - - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - -// Adjust the initial values to allow for complement instead of negation -// This initial offset is the same for [f,g] and [u,v] compositions. -// Save it in stable registers for the [u,v] part and do [f,g] first. - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - - and x0, m10, s10 - and x1, m11, s11 - add car1, x0, x1 - -// Now the computation of the updated f and g values. This maintains a -// 2-word carry between stages so we can conveniently insert the shift -// right by 59 before storing back, and not overwrite digits we need -// again of the old f and g values. -// -// Digit 0 of [f,g] - - ldr x7, [f] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [g] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x3, x3, x1 - -// Digit 1 of [f,g] - - ldr x7, [f+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [g+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [f] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [g] - -// Digit 2 of [f,g] - - ldr x7, [f+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [g+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [f+N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [g+N] - -// Digit 3 of [f,g] - - ldr x7, [f+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [g+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [f+2*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [g+2*N] - -// Digit 4 of [f,g] - - ldr x7, [f+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [g+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [f+3*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [g+3*N] - -// Digits 5 and 6 of [f,g] - - ldr x7, [f+5*N] - eor x1, x7, s00 - ldr x23, [f+6*N] - eor x2, x23, s00 - and x2, x2, m00 - neg x2, x2 - mul x0, x1, 
m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [g+5*N] - eor x1, x8, s01 - ldr x24, [g+6*N] - eor x0, x24, s01 - and x0, x0, m01 - sub x2, x2, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [f+4*N] - extr x4, x2, x4, #59 - str x4, [f+5*N] - asr x2, x2, #59 - str x2, [f+6*N] - - eor x1, x7, s10 - eor x4, x23, s10 - and x4, x4, m10 - neg x4, x4 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x4, x4, x1 - eor x1, x8, s11 - eor x0, x24, s11 - and x0, x0, m11 - sub x4, x4, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x4, x4, x1 - extr x6, x5, x6, #59 - str x6, [g+4*N] - extr x5, x4, x5, #59 - str x5, [g+5*N] - asr x4, x4, #59 - str x4, [g+6*N] - -// Now the computation of the updated u and v values and their -// Montgomery reductions. A very similar accumulation except that -// the top words of u and v are unsigned and we don't shift. -// -// Digit 0 of [u,v] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v] - adc x3, x3, x1 - -// Digit 1 of [u,v] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - str x3, [v+N] - adc x4, x4, x1 - -// Digit 2 of [u,v] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - str x4, [v+2*N] - adc x2, x2, x1 - -// Digit 3 of [u,v] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - str x2, [v+3*N] - adc x6, x6, x1 - -// Digit 4 of [u,v] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - str x6, [v+4*N] - adc x5, x5, x1 - -// Digits 5 and 6 of [u,v] (top is unsigned) - - ldr x7, [u+5*N] - eor x1, x7, s00 - and x2, s00, m00 - neg x2, x2 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x2, x2, x0 - mul x0, x1, m01 - 
umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - str x2, [u+6*N] - - eor x1, x7, s10 - and x4, s10, m10 - neg x4, x4 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x4, x4, x1 - eor x1, x8, s11 - and x0, s11, m11 - sub x4, x4, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v+5*N] - adc x4, x4, x1 - str x4, [v+6*N] - -// Montgomery reduction of u - - ldp x0, x1, [u] - ldp x2, x3, [u+16] - ldp x4, x5, [u+32] - ldr x6, [u+48] - amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) - stp x1, x2, [u] - stp x3, x4, [u+16] - stp x5, x6, [u+32] - -// Montgomery reduction of v - - ldp x0, x1, [v] - ldp x2, x3, [v+16] - ldp x4, x5, [v+32] - ldr x6, [v+48] - amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) - stp x1, x2, [v] - stp x3, x4, [v+16] - stp x5, x6, [v+32] - -bignum_montinv_p384_midloop: - - mov x1, d - ldr x2, [f] - ldr x3, [g] - divstep59() - mov d, x1 - -// Next iteration - - subs i, i, #1 - bne bignum_montinv_p384_loop - -// The 15th and last iteration does not need anything except the -// u value and the sign of f; the latter can be obtained from the -// lowest word of f. So it's done differently from the main loop. -// Find the sign of the new f. For this we just need one digit -// since we know (for in-scope cases) that f is either +1 or -1. -// We don't explicitly shift right by 59 either, but looking at -// bit 63 (or any bit >= 60) of the unshifted result is enough -// to distinguish -1 from +1; this is then made into a mask. - - ldr x0, [f] - ldr x1, [g] - mul x0, x0, m00 - madd x1, x1, m01, x0 - asr x0, x1, #63 - -// Now separate out the matrix into sign-magnitude pairs -// and adjust each one based on the sign of f. -// -// Note that at this point we expect |f|=1 and we got its -// sign above, so then since [f,0] == x * 2^{-768} [u,v] (mod p_384) -// we want to flip the sign of u according to that of f. 
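
[Editor's note, not part of the patch: the sign handling here uses two standard two's-complement identities. An arithmetic right shift of bit 63 turns the sign into an all-zeros or all-ones mask, and negation is replaced by XOR-with-mask, with the missing +1 (scaled by the corresponding multiplier) folded into the initial accumulator, which is what the car0 adjustment above does. A minimal Python illustration of the masking idea, with 64-bit semantics emulated explicitly; this is illustrative only, not library code.]

    M64 = (1 << 64) - 1

    def sign_mask(x64):
        # like "asr #63": 0 for non-negative, all-ones for negative values
        return M64 if (x64 >> 63) & 1 else 0

    def cond_negate(x64, mask):
        # (x ^ mask) + (mask & 1) equals -x mod 2^64 when mask is all-ones,
        # and x unchanged when mask is zero
        return ((x64 ^ mask) + (mask & 1)) & M64

    assert cond_negate(5, sign_mask((-1) & M64)) == (-5) & M64
    assert cond_negate(5, sign_mask(1)) == 5
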
- - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - eor s00, s00, x0 - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - eor s01, s01, x0 - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - eor s10, s10, x0 - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - eor s11, s11, x0 - -// Adjust the initial value to allow for complement instead of negation - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - -// Digit 0 of [u] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - -// Digit 1 of [u] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - -// Digit 2 of [u] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - -// Digit 3 of [u] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - -// Digit 4 of [u] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - -// Digits 5 and 6 of [u] (top is unsigned) - - ldr x7, [u+5*N] - eor x1, x7, s00 - and x2, s00, m00 - neg x2, x2 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, x2, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x2, x2, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - str x2, [u+6*N] - -// Montgomery reduction of u. This needs to be strict not "almost" -// so it is followed by an optional subtraction of p_384 - - ldp x10, x0, [u] - ldp x1, x2, [u+16] - ldp x3, x4, [u+32] - ldr x5, [u+48] - amontred(x5,x4,x3,x2,x1,x0,x10, x9,x8,x7) - - mov x10, #0x00000000ffffffff - subs x10, x0, x10 - mov x11, #0xffffffff00000000 - sbcs x11, x1, x11 - mov x12, #0xfffffffffffffffe - sbcs x12, x2, x12 - mov x15, #0xffffffffffffffff - sbcs x13, x3, x15 - sbcs x14, x4, x15 - sbcs x15, x5, x15 - - csel x0, x0, x10, cc - csel x1, x1, x11, cc - csel x2, x2, x12, cc - csel x3, x3, x13, cc - csel x4, x4, x14, cc - csel x5, x5, x15, cc - -// Store it back to the final output - - stp x0, x1, [res] - stp x2, x3, [res, #16] - stp x4, x5, [res, #32] - -// Restore stack and registers - - add sp, sp, NSPACE - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjadd_alt.S b/third_party/s2n-bignum/arm/p384/p384_montjadd_alt.S deleted file mode 100644 index b84065dea97..00000000000 --- a/third_party/s2n-bignum/arm/p384/p384_montjadd_alt.S +++ /dev/null @@ -1,993 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates -// -// extern void p384_montjadd_alt -// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); -// -// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with -// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. -// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). -// -// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjadd_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjadd_alt) - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 48 - -// Stable homes for input arguments during main code sequence - -#define input_z x24 -#define input_x x25 -#define input_y x26 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_2 input_y, #0 -#define y_2 input_y, #NUMSIZE -#define z_2 input_y, #(2*NUMSIZE) - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define z1sq sp, #(NUMSIZE*0) -#define ww sp, #(NUMSIZE*0) -#define resx sp, #(NUMSIZE*0) - -#define yd sp, #(NUMSIZE*1) -#define y2a sp, #(NUMSIZE*1) - -#define x2a sp, #(NUMSIZE*2) -#define zzx2 sp, #(NUMSIZE*2) - -#define zz sp, #(NUMSIZE*3) -#define t1 sp, #(NUMSIZE*3) - -#define t2 sp, #(NUMSIZE*4) -#define x1a sp, #(NUMSIZE*4) -#define zzx1 sp, #(NUMSIZE*4) -#define resy sp, #(NUMSIZE*4) - -#define xd sp, #(NUMSIZE*5) -#define z2sq sp, #(NUMSIZE*5) -#define resz sp, #(NUMSIZE*5) - -#define y1a sp, #(NUMSIZE*6) - -#define NSPACE (NUMSIZE*7) - -// Corresponds exactly to bignum_montmul_p384_alt - -#define montmul_p384(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - 
adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, 
#0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ - adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt - -#define montsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, 
#-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ - sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Almost-Montgomery variant which we use when an input to other muls -// with the other argument fully reduced (which is always safe). In -// fact, with the Karatsuba-based Montgomery mul here, we don't even -// *need* the restriction that the other argument is reduced. 
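
[Editor's note, not part of the patch: the difference between the strict and the "almost" Montgomery squaring is only in how far the final result is reduced. Both return a value congruent to a * 2^-384 (mod p_384); the almost variant only guarantees the result fits in 384 bits, not that it lies below p_384. A generic Python sketch of the two reductions follows; the function names are illustrative and not the s2n-bignum API.]

    p384 = 2**384 - 2**128 - 2**96 + 2**32 - 1
    R = 2**384

    def amontred(a):
        # "Almost" Montgomery reduction: result < 2^384, congruent to a * R^{-1}
        # (mod p384), but possibly still >= p384.
        m = (a * pow(-p384, -1, R)) % R      # choose m so a + m*p384 is divisible by R
        t = (a + m * p384) // R
        if t >= R:                           # fold a possible top carry back below 2^384
            t -= p384
        return t

    def montred(a):
        # Strict version: additionally reduce below p384 itself.
        t = amontred(a)
        return t - p384 if t >= p384 else t

    a = (3**100 % R) * (5**80 % R)           # a product of two values below 2^384
    assert amontred(a) % p384 == (a * pow(R, -1, p384)) % p384
    assert montred(a) == (a * pow(R, -1, p384)) % p384

[A not-fully-reduced value is still a valid input to further Montgomery multiplications, which is why the cheaper variant can be used for intermediate squarings, as the comment above notes.]
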
- -#define amontsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ - sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, 
#32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - mov x14, #-4294967295; \ - mov x15, #4294967295; \ - csel x14, x14, xzr, cs; \ - csel x15, x15, xzr, cs; \ - cset x16, cs; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, xzr; \ - adcs x12, x12, xzr; \ - adc x13, x13, xzr; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Corresponds exactly to bignum_sub_p384 - -#define sub_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - csetm x3, lo; \ - mov x4, #4294967295; \ - and x4, x4, x3; \ - adds x5, x5, x4; \ - eor x4, x4, x3; \ - adcs x6, x6, x4; \ - mov x4, #-2; \ - and x4, x4, x3; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - adcs x9, x9, x3; \ - adc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -S2N_BN_SYMBOL(p384_montjadd_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! 
- sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - mov input_y, x2 - -// Main code, just a sequence of basic field operations -// 8 * multiply + 3 * square + 7 * subtract - - amontsqr_p384(z1sq,z_1) - amontsqr_p384(z2sq,z_2) - - montmul_p384(y1a,z_2,y_1) - montmul_p384(y2a,z_1,y_2) - - montmul_p384(x2a,z1sq,x_2) - montmul_p384(x1a,z2sq,x_1) - montmul_p384(y2a,z1sq,y2a) - montmul_p384(y1a,z2sq,y1a) - - sub_p384(xd,x2a,x1a) - sub_p384(yd,y2a,y1a) - - amontsqr_p384(zz,xd) - montsqr_p384(ww,yd) - - montmul_p384(zzx1,zz,x1a) - montmul_p384(zzx2,zz,x2a) - - sub_p384(resx,ww,zzx1) - sub_p384(t1,zzx2,zzx1) - - montmul_p384(xd,xd,z_1) - - sub_p384(resx,resx,zzx2) - - sub_p384(t2,zzx1,resx) - - montmul_p384(t1,t1,y1a) - montmul_p384(resz,xd,z_2) - montmul_p384(t2,yd,t2) - - sub_p384(resy,t2,t1) - -// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 -// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) -// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) -// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 - - ldp x0, x1, [z_1] - ldp x2, x3, [z_1+16] - ldp x4, x5, [z_1+32] - - orr x20, x0, x1 - orr x21, x2, x3 - orr x22, x4, x5 - orr x20, x20, x21 - orr x20, x20, x22 - cmp x20, xzr - cset x20, ne - - ldp x6, x7, [z_2] - ldp x8, x9, [z_2+16] - ldp x10, x11, [z_2+32] - - orr x21, x6, x7 - orr x22, x8, x9 - orr x23, x10, x11 - orr x21, x21, x22 - orr x21, x21, x23 - cmp x21, xzr - cset x21, ne - - cmp x21, x20 - -// Multiplex the outputs accordingly, re-using the z's in registers - - ldp x12, x13, [resz] - csel x12, x0, x12, lo - csel x13, x1, x13, lo - csel x12, x6, x12, hi - csel x13, x7, x13, hi - ldp x14, x15, [resz+16] - csel x14, x2, x14, lo - csel x15, x3, x15, lo - csel x14, x8, x14, hi - csel x15, x9, x15, hi - ldp x16, x17, [resz+32] - csel x16, x4, x16, lo - csel x17, x5, x17, lo - csel x16, x10, x16, hi - csel x17, x11, x17, hi - - ldp x20, x21, [x_1] - ldp x0, x1, [resx] - csel x0, x20, x0, lo - csel x1, x21, x1, lo - ldp x20, x21, [x_2] - csel x0, x20, x0, hi - csel x1, x21, x1, hi - - ldp x20, x21, [x_1+16] - ldp x2, x3, [resx+16] - csel x2, x20, x2, lo - csel x3, x21, x3, lo - ldp x20, x21, [x_2+16] - csel x2, x20, x2, hi - csel x3, x21, x3, hi - - ldp x20, x21, [x_1+32] - ldp x4, x5, [resx+32] - csel x4, x20, x4, lo - csel x5, x21, x5, lo - ldp x20, x21, [x_2+32] - csel x4, x20, x4, hi - csel x5, x21, x5, hi - - ldp x20, x21, [y_1] - ldp x6, x7, [resy] - csel x6, x20, x6, lo - csel x7, x21, x7, lo - ldp x20, x21, [y_2] - csel x6, x20, x6, hi - csel x7, x21, x7, hi - - ldp x20, x21, [y_1+16] - ldp x8, x9, [resy+16] - csel x8, x20, x8, lo - csel x9, x21, x9, lo - ldp x20, x21, [y_2+16] - csel x8, x20, x8, hi - csel x9, x21, x9, hi - - ldp x20, x21, [y_1+32] - ldp x10, x11, [resy+32] - csel x10, x20, x10, lo - csel x11, x21, x11, lo - ldp x20, x21, [y_2+32] - csel x10, x20, x10, hi - csel x11, x21, x11, hi - -// Finally store back the multiplexed values - - stp x0, x1, [x_3] - stp x2, x3, [x_3+16] - stp x4, x5, [x_3+32] - stp x6, x7, [y_3] - stp x8, x9, [y_3+16] - stp x10, x11, [y_3+32] - stp x12, x13, [z_3] - stp x14, x15, [z_3+16] - stp x16, x17, [z_3+32] - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjdouble_alt.S 
b/third_party/s2n-bignum/arm/p384/p384_montjdouble_alt.S deleted file mode 100644 index 0e83ff4a986..00000000000 --- a/third_party/s2n-bignum/arm/p384/p384_montjdouble_alt.S +++ /dev/null @@ -1,951 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates -// -// extern void p384_montjdouble_alt -// (uint64_t p3[static 18],uint64_t p1[static 18]); -// -// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with -// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. -// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). -// -// Standard ARM ABI: X0 = p3, X1 = p1 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjdouble_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjdouble_alt) - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 48 - -// Stable homes for input arguments during main code sequence - -#define input_z x23 -#define input_x x24 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define z2 sp, #(NUMSIZE*0) -#define y2 sp, #(NUMSIZE*1) -#define x2p sp, #(NUMSIZE*2) -#define xy2 sp, #(NUMSIZE*3) - -#define y4 sp, #(NUMSIZE*4) -#define t2 sp, #(NUMSIZE*4) - -#define dx2 sp, #(NUMSIZE*5) -#define t1 sp, #(NUMSIZE*5) - -#define d sp, #(NUMSIZE*6) -#define x4p sp, #(NUMSIZE*6) - -#define NSPACE (NUMSIZE*7) - -// Corresponds exactly to bignum_montmul_p384_alt - -#define montmul_p384(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, 
x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, 
x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ - adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt - -#define montsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - 
adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ - sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Corresponds exactly to bignum_sub_p384 - -#define sub_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - csetm x3, lo; \ - mov x4, #4294967295; \ - and x4, x4, x3; \ - adds x5, x5, x4; \ - eor x4, x4, x3; \ - adcs x6, x6, x4; \ - mov x4, #-2; \ - and x4, x4, x3; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - adcs x9, x9, x3; \ - adc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -// Corresponds exactly to bignum_add_p384 - -#define add_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - adds x5, x5, x4; \ - adcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ 
- ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - adcs x9, x9, x4; \ - adcs x10, x10, x3; \ - adc x3, xzr, xzr; \ - mov x4, #0xffffffff; \ - cmp x5, x4; \ - mov x4, #0xffffffff00000000; \ - sbcs xzr, x6, x4; \ - mov x4, #0xfffffffffffffffe; \ - sbcs xzr, x7, x4; \ - adcs xzr, x8, xzr; \ - adcs xzr, x9, xzr; \ - adcs xzr, x10, xzr; \ - adcs x3, x3, xzr; \ - csetm x3, ne; \ - mov x4, #0xffffffff; \ - and x4, x4, x3; \ - subs x5, x5, x4; \ - eor x4, x4, x3; \ - sbcs x6, x6, x4; \ - mov x4, #0xfffffffffffffffe; \ - and x4, x4, x3; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - sbcs x9, x9, x3; \ - sbc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -// P0 = 4 * P1 - P2 - -#define cmsub41_p384(P0,P1,P2) \ - ldp x1, x2, [P1]; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P1+32]; \ - lsl x0, x1, #2; \ - ldp x7, x8, [P2]; \ - subs x0, x0, x7; \ - extr x1, x2, x1, #62; \ - sbcs x1, x1, x8; \ - ldp x7, x8, [P2+16]; \ - extr x2, x3, x2, #62; \ - sbcs x2, x2, x7; \ - extr x3, x4, x3, #62; \ - sbcs x3, x3, x8; \ - extr x4, x5, x4, #62; \ - ldp x7, x8, [P2+32]; \ - sbcs x4, x4, x7; \ - extr x5, x6, x5, #62; \ - sbcs x5, x5, x8; \ - lsr x6, x6, #62; \ - adc x6, x6, xzr; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x8, cc; \ - mov x9, #0xffffffff; \ - and x9, x9, x8; \ - adds x0, x0, x9; \ - eor x9, x9, x8; \ - adcs x1, x1, x9; \ - mov x9, #0xfffffffffffffffe; \ - and x9, x9, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x8; \ - adcs x4, x4, x8; \ - adc x5, x5, x8; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ - stp x4, x5, [P0+32] - -// P0 = C * P1 - D * P2 - -#define cmsub_p384(P0,C,P1,D,P2) \ - ldp x0, x1, [P2]; \ - mov x6, #0x00000000ffffffff; \ - subs x6, x6, x0; \ - mov x7, #0xffffffff00000000; \ - sbcs x7, x7, x1; \ - ldp x0, x1, [P2+16]; \ - mov x8, #0xfffffffffffffffe; \ - sbcs x8, x8, x0; \ - mov x13, #0xffffffffffffffff; \ - sbcs x9, x13, x1; \ - ldp x0, x1, [P2+32]; \ - sbcs x10, x13, x0; \ - sbc x11, x13, x1; \ - mov x12, D; \ - mul x0, x12, x6; \ - mul x1, x12, x7; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x4, x12, x10; \ - mul x5, x12, x11; \ - umulh x6, x12, x6; \ - umulh x7, x12, x7; \ - umulh x8, x12, x8; \ - umulh x9, x12, x9; \ - umulh x10, x12, x10; \ - umulh x12, x12, x11; \ - adds x1, x1, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x8; \ - adcs x4, x4, x9; \ - adcs x5, x5, x10; \ - mov x6, #1; \ - adc x6, x12, x6; \ - ldp x8, x9, [P1]; \ - ldp x10, x11, [P1+16]; \ - ldp x12, x13, [P1+32]; \ - mov x14, C; \ - mul x15, x14, x8; \ - umulh x8, x14, x8; \ - adds x0, x0, x15; \ - mul x15, x14, x9; \ - umulh x9, x14, x9; \ - adcs x1, x1, x15; \ - mul x15, x14, x10; \ - umulh x10, x14, x10; \ - adcs x2, x2, x15; \ - mul x15, x14, x11; \ - umulh x11, x14, x11; \ - adcs x3, x3, x15; \ - mul x15, x14, x12; \ - umulh x12, x14, x12; \ - adcs x4, x4, x15; \ - mul x15, x14, x13; \ - umulh x13, x14, x13; \ - adcs x5, x5, x15; \ - adc x6, x6, xzr; \ - adds x1, x1, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x10; \ - adcs x4, x4, x11; \ - adcs x5, x5, x12; \ - adcs x6, x6, x13; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x6, cc; \ - mov x7, #0xffffffff; \ - and x7, x7, x6; \ - adds x0, x0, x7; \ - eor x7, x7, x6; \ - adcs x1, x1, x7; \ - mov x7, 
#0xfffffffffffffffe; \ - and x7, x7, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x6; \ - adcs x4, x4, x6; \ - adc x5, x5, x6; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ - stp x4, x5, [P0+32] - -// A weak version of add that only guarantees sum in 6 digits - -#define weakadd_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - adds x5, x5, x4; \ - adcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - adcs x9, x9, x4; \ - adcs x10, x10, x3; \ - csetm x3, cs; \ - mov x4, #0xffffffff; \ - and x4, x4, x3; \ - subs x5, x5, x4; \ - eor x4, x4, x3; \ - sbcs x6, x6, x4; \ - mov x4, #0xfffffffffffffffe; \ - and x4, x4, x3; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - sbcs x9, x9, x3; \ - sbc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -// P0 = 3 * P1 - 8 * P2 - -#define cmsub38_p384(P0,P1,P2) \ - ldp x0, x1, [P2]; \ - mov x6, #0x00000000ffffffff; \ - subs x6, x6, x0; \ - mov x7, #0xffffffff00000000; \ - sbcs x7, x7, x1; \ - ldp x0, x1, [P2+16]; \ - mov x8, #0xfffffffffffffffe; \ - sbcs x8, x8, x0; \ - mov x13, #0xffffffffffffffff; \ - sbcs x9, x13, x1; \ - ldp x0, x1, [P2+32]; \ - sbcs x10, x13, x0; \ - sbc x11, x13, x1; \ - lsl x0, x6, #3; \ - extr x1, x7, x6, #61; \ - extr x2, x8, x7, #61; \ - extr x3, x9, x8, #61; \ - extr x4, x10, x9, #61; \ - extr x5, x11, x10, #61; \ - lsr x6, x11, #61; \ - add x6, x6, #1; \ - ldp x8, x9, [P1]; \ - ldp x10, x11, [P1+16]; \ - ldp x12, x13, [P1+32]; \ - mov x14, 3; \ - mul x15, x14, x8; \ - umulh x8, x14, x8; \ - adds x0, x0, x15; \ - mul x15, x14, x9; \ - umulh x9, x14, x9; \ - adcs x1, x1, x15; \ - mul x15, x14, x10; \ - umulh x10, x14, x10; \ - adcs x2, x2, x15; \ - mul x15, x14, x11; \ - umulh x11, x14, x11; \ - adcs x3, x3, x15; \ - mul x15, x14, x12; \ - umulh x12, x14, x12; \ - adcs x4, x4, x15; \ - mul x15, x14, x13; \ - umulh x13, x14, x13; \ - adcs x5, x5, x15; \ - adc x6, x6, xzr; \ - adds x1, x1, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x10; \ - adcs x4, x4, x11; \ - adcs x5, x5, x12; \ - adcs x6, x6, x13; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x6, cc; \ - mov x7, #0xffffffff; \ - and x7, x7, x6; \ - adds x0, x0, x7; \ - eor x7, x7, x6; \ - adcs x1, x1, x7; \ - mov x7, #0xfffffffffffffffe; \ - and x7, x7, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x6; \ - adcs x4, x4, x6; \ - adc x5, x5, x6; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ - stp x4, x5, [P0+32] - -S2N_BN_SYMBOL(p384_montjdouble_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! 
- sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - -// Main code, just a sequence of basic field operations - -// z2 = z^2 -// y2 = y^2 - - montsqr_p384(z2,z_1) - montsqr_p384(y2,y_1) - -// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) - - weakadd_p384(t1,x_1,z2) - sub_p384(t2,x_1,z2) - montmul_p384(x2p,t1,t2) - -// t1 = y + z -// x4p = x2p^2 -// xy2 = x * y^2 - - add_p384(t1,y_1,z_1) - montsqr_p384(x4p,x2p) - montmul_p384(xy2,x_1,y2) - -// t2 = (y + z)^2 - - montsqr_p384(t2,t1) - -// d = 12 * xy2 - 9 * x4p -// t1 = y^2 + 2 * y * z - - cmsub_p384(d,12,xy2,9,x4p) - sub_p384(t1,t2,z2) - -// y4 = y^4 - - montsqr_p384(y4,y2) - -// z_3' = 2 * y * z -// dx2 = d * x2p - - sub_p384(z_3,t1,y2) - montmul_p384(dx2,d,x2p) - -// x' = 4 * xy2 - d - - cmsub41_p384(x_3,xy2,d) - -// y' = 3 * dx2 - 8 * y4 - - cmsub38_p384(y_3,dx2,y4) - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjmixadd.S b/third_party/s2n-bignum/arm/p384/p384_montjmixadd.S deleted file mode 100644 index f340e4f5ce6..00000000000 --- a/third_party/s2n-bignum/arm/p384/p384_montjmixadd.S +++ /dev/null @@ -1,876 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates -// -// extern void p384_montjmixadd -// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); -// -// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with -// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. -// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). -// The "mixed" part means that p2 only has x and y coordinates, with the -// implicit z coordinate assumed to be the identity. 
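Read as straight-line field arithmetic, the p384_montjdouble_alt sequence above corresponds to the following C-level sketch. The fe type and the fe_* helper names are hypothetical stand-ins for the assembly macros (montsqr_p384, montmul_p384, add_p384, weakadd_p384, sub_p384, cmsub_p384, cmsub41_p384, cmsub38_p384); this is an illustration of the doubling formula, not code from the tree:

    #include <stdint.h>

    typedef uint64_t fe[6];   /* one P-384 field element, 6 x 64-bit limbs */

    /* Hypothetical prototypes mirroring the assembly macros. */
    void fe_montsqr(fe r, const fe a);                 /* montsqr_p384  */
    void fe_montmul(fe r, const fe a, const fe b);     /* montmul_p384  */
    void fe_add(fe r, const fe a, const fe b);         /* add_p384      */
    void fe_weakadd(fe r, const fe a, const fe b);     /* weakadd_p384  */
    void fe_sub(fe r, const fe a, const fe b);         /* sub_p384      */
    void fe_cmsub(fe r, uint64_t c, const fe a, uint64_t d, const fe b); /* c*a - d*b */
    void fe_cmsub41(fe r, const fe a, const fe b);     /* 4*a - b       */
    void fe_cmsub38(fe r, const fe a, const fe b);     /* 3*a - 8*b     */

    /* Jacobian doubling (x,y,z) -> (x3,y3,z3), coordinates Montgomery-encoded. */
    void p384_jdouble_sketch(fe x3, fe y3, fe z3,
                             const fe x, const fe y, const fe z) {
      fe z2, y2, t1, t2, x2p, x4p, xy2, y4, d, dx2;
      fe_montsqr(z2, z);                 /* z2  = z^2                         */
      fe_montsqr(y2, y);                 /* y2  = y^2                         */
      fe_weakadd(t1, x, z2);             /* t1  = x + z^2 (6 digits, not reduced) */
      fe_sub(t2, x, z2);                 /* t2  = x - z^2                     */
      fe_montmul(x2p, t1, t2);           /* x2p = x^2 - z^4                   */
      fe_add(t1, y, z);                  /* t1  = y + z                       */
      fe_montsqr(x4p, x2p);              /* x4p = x2p^2                       */
      fe_montmul(xy2, x, y2);            /* xy2 = x * y^2                     */
      fe_montsqr(t2, t1);                /* t2  = (y + z)^2                   */
      fe_cmsub(d, 12, xy2, 9, x4p);      /* d   = 12*xy2 - 9*x4p              */
      fe_sub(t1, t2, z2);                /* t1  = y^2 + 2*y*z                 */
      fe_montsqr(y4, y2);                /* y4  = y^4                         */
      fe_sub(z3, t1, y2);                /* z3  = 2*y*z                       */
      fe_montmul(dx2, d, x2p);           /* dx2 = d * x2p                     */
      fe_cmsub41(x3, xy2, d);            /* x3  = 4*xy2 - d                   */
      fe_cmsub38(y3, dx2, y4);           /* y3  = 3*dx2 - 8*y4                */
    }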
-// -// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd) - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 48 - -// Stable homes for input arguments during main code sequence - -#define input_z x24 -#define input_x x25 -#define input_y x26 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_2 input_y, #0 -#define y_2 input_y, #NUMSIZE - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define zp2 sp, #(NUMSIZE*0) -#define ww sp, #(NUMSIZE*0) -#define resx sp, #(NUMSIZE*0) - -#define yd sp, #(NUMSIZE*1) -#define y2a sp, #(NUMSIZE*1) - -#define x2a sp, #(NUMSIZE*2) -#define zzx2 sp, #(NUMSIZE*2) - -#define zz sp, #(NUMSIZE*3) -#define t1 sp, #(NUMSIZE*3) - -#define t2 sp, #(NUMSIZE*4) -#define zzx1 sp, #(NUMSIZE*4) -#define resy sp, #(NUMSIZE*4) - -#define xd sp, #(NUMSIZE*5) -#define resz sp, #(NUMSIZE*5) - -#define NSPACE (NUMSIZE*6) - -// Corresponds to bignum_montmul_p384 except x24 -> x0 - -#define montmul_p384(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P1+32]; \ - ldp x9, x10, [P2]; \ - ldp x11, x12, [P2+16]; \ - ldp x13, x14, [P2+32]; \ - mul x15, x3, x9; \ - mul x21, x4, x10; \ - mul x22, x5, x11; \ - umulh x23, x3, x9; \ - umulh x0, x4, x10; \ - umulh x1, x5, x11; \ - adds x23, x23, x21; \ - adcs x0, x0, x22; \ - adc x1, x1, xzr; \ - adds x16, x23, x15; \ - adcs x17, x0, x23; \ - adcs x19, x1, x0; \ - adc x20, x1, xzr; \ - adds x17, x17, x15; \ - adcs x19, x19, x23; \ - adcs x20, x20, x0; \ - adc x1, x1, xzr; \ - subs x0, x3, x4; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x10, x9; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x16, x16, x21; \ - adcs x17, x17, x22; \ - adcs x19, x19, x23; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x3, x5; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x11, x9; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x17, x17, x21; \ - adcs x19, x19, x22; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x4, x5; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x11, x10; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x19, x19, x21; \ - adcs x20, x20, x22; \ - adc x1, x1, x23; \ - lsl x23, x15, #32; \ - add x15, x23, x15; \ - lsr x23, x15, #32; \ - subs x23, x23, x15; \ - sbc x22, x15, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x15; \ - adc x21, xzr, xzr; \ - subs x16, x16, x23; \ - sbcs x17, x17, x22; \ - sbcs x19, x19, x21; \ - sbcs x20, x20, xzr; \ - sbcs x1, x1, xzr; \ - sbc x15, x15, xzr; \ - lsl x23, x16, #32; \ - add x16, x23, x16; \ - lsr x23, x16, #32; \ - subs x23, x23, x16; \ - sbc x22, x16, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x16; \ - adc x21, xzr, xzr; \ 
- subs x17, x17, x23; \ - sbcs x19, x19, x22; \ - sbcs x20, x20, x21; \ - sbcs x1, x1, xzr; \ - sbcs x15, x15, xzr; \ - sbc x16, x16, xzr; \ - lsl x23, x17, #32; \ - add x17, x23, x17; \ - lsr x23, x17, #32; \ - subs x23, x23, x17; \ - sbc x22, x17, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x17; \ - adc x21, xzr, xzr; \ - subs x19, x19, x23; \ - sbcs x20, x20, x22; \ - sbcs x1, x1, x21; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbc x17, x17, xzr; \ - stp x19, x20, [P0]; \ - stp x1, x15, [P0+16]; \ - stp x16, x17, [P0+32]; \ - mul x15, x6, x12; \ - mul x21, x7, x13; \ - mul x22, x8, x14; \ - umulh x23, x6, x12; \ - umulh x0, x7, x13; \ - umulh x1, x8, x14; \ - adds x23, x23, x21; \ - adcs x0, x0, x22; \ - adc x1, x1, xzr; \ - adds x16, x23, x15; \ - adcs x17, x0, x23; \ - adcs x19, x1, x0; \ - adc x20, x1, xzr; \ - adds x17, x17, x15; \ - adcs x19, x19, x23; \ - adcs x20, x20, x0; \ - adc x1, x1, xzr; \ - subs x0, x6, x7; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x13, x12; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x16, x16, x21; \ - adcs x17, x17, x22; \ - adcs x19, x19, x23; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x6, x8; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x14, x12; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x17, x17, x21; \ - adcs x19, x19, x22; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x7, x8; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x14, x13; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x19, x19, x21; \ - adcs x20, x20, x22; \ - adc x1, x1, x23; \ - subs x6, x6, x3; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x5; \ - ngc x3, xzr; \ - cmn x3, #1; \ - eor x6, x6, x3; \ - adcs x6, x6, xzr; \ - eor x7, x7, x3; \ - adcs x7, x7, xzr; \ - eor x8, x8, x3; \ - adc x8, x8, xzr; \ - subs x9, x9, x12; \ - sbcs x10, x10, x13; \ - sbcs x11, x11, x14; \ - ngc x14, xzr; \ - cmn x14, #1; \ - eor x9, x9, x14; \ - adcs x9, x9, xzr; \ - eor x10, x10, x14; \ - adcs x10, x10, xzr; \ - eor x11, x11, x14; \ - adc x11, x11, xzr; \ - eor x14, x3, x14; \ - ldp x21, x22, [P0]; \ - adds x15, x15, x21; \ - adcs x16, x16, x22; \ - ldp x21, x22, [P0+16]; \ - adcs x17, x17, x21; \ - adcs x19, x19, x22; \ - ldp x21, x22, [P0+32]; \ - adcs x20, x20, x21; \ - adcs x1, x1, x22; \ - adc x2, xzr, xzr; \ - stp x15, x16, [P0]; \ - stp x17, x19, [P0+16]; \ - stp x20, x1, [P0+32]; \ - mul x15, x6, x9; \ - mul x21, x7, x10; \ - mul x22, x8, x11; \ - umulh x23, x6, x9; \ - umulh x0, x7, x10; \ - umulh x1, x8, x11; \ - adds x23, x23, x21; \ - adcs x0, x0, x22; \ - adc x1, x1, xzr; \ - adds x16, x23, x15; \ - adcs x17, x0, x23; \ - adcs x19, x1, x0; \ - adc x20, x1, xzr; \ - adds x17, x17, x15; \ - adcs x19, x19, x23; \ - adcs x20, x20, x0; \ - adc x1, x1, xzr; \ - subs x0, x6, x7; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x10, x9; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x16, x16, x21; \ - adcs x17, x17, x22; \ - adcs x19, x19, x23; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x6, x8; \ - cneg x0, x0, lo; \ - csetm 
x23, lo; \ - subs x22, x11, x9; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x17, x17, x21; \ - adcs x19, x19, x22; \ - adcs x20, x20, x23; \ - adc x1, x1, x23; \ - subs x0, x7, x8; \ - cneg x0, x0, lo; \ - csetm x23, lo; \ - subs x22, x11, x10; \ - cneg x22, x22, lo; \ - mul x21, x0, x22; \ - umulh x22, x0, x22; \ - cinv x23, x23, lo; \ - eor x21, x21, x23; \ - eor x22, x22, x23; \ - cmn x23, #1; \ - adcs x19, x19, x21; \ - adcs x20, x20, x22; \ - adc x1, x1, x23; \ - ldp x3, x4, [P0]; \ - ldp x5, x6, [P0+16]; \ - ldp x7, x8, [P0+32]; \ - cmn x14, #1; \ - eor x15, x15, x14; \ - adcs x15, x15, x3; \ - eor x16, x16, x14; \ - adcs x16, x16, x4; \ - eor x17, x17, x14; \ - adcs x17, x17, x5; \ - eor x19, x19, x14; \ - adcs x19, x19, x6; \ - eor x20, x20, x14; \ - adcs x20, x20, x7; \ - eor x1, x1, x14; \ - adcs x1, x1, x8; \ - adcs x9, x14, x2; \ - adcs x10, x14, xzr; \ - adcs x11, x14, xzr; \ - adc x12, x14, xzr; \ - adds x19, x19, x3; \ - adcs x20, x20, x4; \ - adcs x1, x1, x5; \ - adcs x9, x9, x6; \ - adcs x10, x10, x7; \ - adcs x11, x11, x8; \ - adc x12, x12, x2; \ - lsl x23, x15, #32; \ - add x15, x23, x15; \ - lsr x23, x15, #32; \ - subs x23, x23, x15; \ - sbc x22, x15, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x15; \ - adc x21, xzr, xzr; \ - subs x16, x16, x23; \ - sbcs x17, x17, x22; \ - sbcs x19, x19, x21; \ - sbcs x20, x20, xzr; \ - sbcs x1, x1, xzr; \ - sbc x15, x15, xzr; \ - lsl x23, x16, #32; \ - add x16, x23, x16; \ - lsr x23, x16, #32; \ - subs x23, x23, x16; \ - sbc x22, x16, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x16; \ - adc x21, xzr, xzr; \ - subs x17, x17, x23; \ - sbcs x19, x19, x22; \ - sbcs x20, x20, x21; \ - sbcs x1, x1, xzr; \ - sbcs x15, x15, xzr; \ - sbc x16, x16, xzr; \ - lsl x23, x17, #32; \ - add x17, x23, x17; \ - lsr x23, x17, #32; \ - subs x23, x23, x17; \ - sbc x22, x17, xzr; \ - extr x23, x22, x23, #32; \ - lsr x22, x22, #32; \ - adds x22, x22, x17; \ - adc x21, xzr, xzr; \ - subs x19, x19, x23; \ - sbcs x20, x20, x22; \ - sbcs x1, x1, x21; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbc x17, x17, xzr; \ - adds x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adc x12, x12, xzr; \ - add x22, x12, #1; \ - lsl x21, x22, #32; \ - subs x0, x22, x21; \ - sbc x21, x21, xzr; \ - adds x19, x19, x0; \ - adcs x20, x20, x21; \ - adcs x1, x1, x22; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adcs x11, x11, xzr; \ - csetm x22, lo; \ - mov x23, #4294967295; \ - and x23, x23, x22; \ - adds x19, x19, x23; \ - eor x23, x23, x22; \ - adcs x20, x20, x23; \ - mov x23, #-2; \ - and x23, x23, x22; \ - adcs x1, x1, x23; \ - adcs x9, x9, x22; \ - adcs x10, x10, x22; \ - adc x11, x11, x22; \ - stp x19, x20, [P0]; \ - stp x1, x9, [P0+16]; \ - stp x10, x11, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384 - -#define montsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - ldp x4, x5, [P1+16]; \ - ldp x6, x7, [P1+32]; \ - mul x14, x2, x3; \ - mul x15, x2, x4; \ - mul x16, x3, x4; \ - mul x8, x2, x2; \ - mul x10, x3, x3; \ - mul x12, x4, x4; \ - umulh x17, x2, x3; \ - adds x15, x15, x17; \ - umulh x17, x2, x4; \ - adcs x16, x16, x17; \ - umulh x17, x3, x4; \ - adcs x17, x17, xzr; \ - umulh x9, x2, x2; \ - umulh x11, x3, x3; \ - umulh x13, x4, x4; \ - adds x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adc x13, x13, xzr; \ - adds x9, 
x9, x14; \ - adcs x10, x10, x15; \ - adcs x11, x11, x16; \ - adcs x12, x12, x17; \ - adc x13, x13, xzr; \ - lsl x16, x8, #32; \ - add x8, x16, x8; \ - lsr x16, x8, #32; \ - subs x16, x16, x8; \ - sbc x15, x8, xzr; \ - extr x16, x15, x16, #32; \ - lsr x15, x15, #32; \ - adds x15, x15, x8; \ - adc x14, xzr, xzr; \ - subs x9, x9, x16; \ - sbcs x10, x10, x15; \ - sbcs x11, x11, x14; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x8, x8, xzr; \ - lsl x16, x9, #32; \ - add x9, x16, x9; \ - lsr x16, x9, #32; \ - subs x16, x16, x9; \ - sbc x15, x9, xzr; \ - extr x16, x15, x16, #32; \ - lsr x15, x15, #32; \ - adds x15, x15, x9; \ - adc x14, xzr, xzr; \ - subs x10, x10, x16; \ - sbcs x11, x11, x15; \ - sbcs x12, x12, x14; \ - sbcs x13, x13, xzr; \ - sbcs x8, x8, xzr; \ - sbc x9, x9, xzr; \ - lsl x16, x10, #32; \ - add x10, x16, x10; \ - lsr x16, x10, #32; \ - subs x16, x16, x10; \ - sbc x15, x10, xzr; \ - extr x16, x15, x16, #32; \ - lsr x15, x15, #32; \ - adds x15, x15, x10; \ - adc x14, xzr, xzr; \ - subs x11, x11, x16; \ - sbcs x12, x12, x15; \ - sbcs x13, x13, x14; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - stp x11, x12, [P0]; \ - stp x13, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - mul x8, x2, x5; \ - mul x14, x3, x6; \ - mul x15, x4, x7; \ - umulh x16, x2, x5; \ - umulh x17, x3, x6; \ - umulh x1, x4, x7; \ - adds x16, x16, x14; \ - adcs x17, x17, x15; \ - adc x1, x1, xzr; \ - adds x9, x16, x8; \ - adcs x10, x17, x16; \ - adcs x11, x1, x17; \ - adc x12, x1, xzr; \ - adds x10, x10, x8; \ - adcs x11, x11, x16; \ - adcs x12, x12, x17; \ - adc x13, x1, xzr; \ - subs x17, x2, x3; \ - cneg x17, x17, lo; \ - csetm x14, lo; \ - subs x15, x6, x5; \ - cneg x15, x15, lo; \ - mul x16, x17, x15; \ - umulh x15, x17, x15; \ - cinv x14, x14, lo; \ - eor x16, x16, x14; \ - eor x15, x15, x14; \ - cmn x14, #1; \ - adcs x9, x9, x16; \ - adcs x10, x10, x15; \ - adcs x11, x11, x14; \ - adcs x12, x12, x14; \ - adc x13, x13, x14; \ - subs x17, x2, x4; \ - cneg x17, x17, lo; \ - csetm x14, lo; \ - subs x15, x7, x5; \ - cneg x15, x15, lo; \ - mul x16, x17, x15; \ - umulh x15, x17, x15; \ - cinv x14, x14, lo; \ - eor x16, x16, x14; \ - eor x15, x15, x14; \ - cmn x14, #1; \ - adcs x10, x10, x16; \ - adcs x11, x11, x15; \ - adcs x12, x12, x14; \ - adc x13, x13, x14; \ - subs x17, x3, x4; \ - cneg x17, x17, lo; \ - csetm x14, lo; \ - subs x15, x7, x6; \ - cneg x15, x15, lo; \ - mul x16, x17, x15; \ - umulh x15, x17, x15; \ - cinv x14, x14, lo; \ - eor x16, x16, x14; \ - eor x15, x15, x14; \ - cmn x14, #1; \ - adcs x11, x11, x16; \ - adcs x12, x12, x15; \ - adc x13, x13, x14; \ - adds x8, x8, x8; \ - adcs x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adc x17, xzr, xzr; \ - ldp x2, x3, [P0]; \ - adds x8, x8, x2; \ - adcs x9, x9, x3; \ - ldp x2, x3, [P0+16]; \ - adcs x10, x10, x2; \ - adcs x11, x11, x3; \ - ldp x2, x3, [P0+32]; \ - adcs x12, x12, x2; \ - adcs x13, x13, x3; \ - adc x17, x17, xzr; \ - lsl x4, x8, #32; \ - add x8, x4, x8; \ - lsr x4, x8, #32; \ - subs x4, x4, x8; \ - sbc x3, x8, xzr; \ - extr x4, x3, x4, #32; \ - lsr x3, x3, #32; \ - adds x3, x3, x8; \ - adc x2, xzr, xzr; \ - subs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, x2; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x8, x8, xzr; \ - lsl x4, x9, #32; \ - add x9, x4, x9; \ - lsr x4, x9, #32; \ - subs x4, x4, x9; \ - sbc x3, x9, xzr; \ - extr x4, x3, x4, #32; \ - lsr x3, x3, #32; \ - adds x3, x3, x9; \ - adc x2, xzr, xzr; \ - subs x10, 
x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, x2; \ - sbcs x13, x13, xzr; \ - sbcs x8, x8, xzr; \ - sbc x9, x9, xzr; \ - lsl x4, x10, #32; \ - add x10, x4, x10; \ - lsr x4, x10, #32; \ - subs x4, x4, x10; \ - sbc x3, x10, xzr; \ - extr x4, x3, x4, #32; \ - lsr x3, x3, #32; \ - adds x3, x3, x10; \ - adc x2, xzr, xzr; \ - subs x11, x11, x4; \ - sbcs x12, x12, x3; \ - sbcs x13, x13, x2; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - adds x17, x17, x8; \ - adcs x8, x9, xzr; \ - adcs x9, x10, xzr; \ - adcs x10, xzr, xzr; \ - mul x1, x5, x5; \ - adds x11, x11, x1; \ - mul x14, x6, x6; \ - mul x15, x7, x7; \ - umulh x1, x5, x5; \ - adcs x12, x12, x1; \ - umulh x1, x6, x6; \ - adcs x13, x13, x14; \ - adcs x17, x17, x1; \ - umulh x1, x7, x7; \ - adcs x8, x8, x15; \ - adcs x9, x9, x1; \ - adc x10, x10, xzr; \ - mul x1, x5, x6; \ - mul x14, x5, x7; \ - mul x15, x6, x7; \ - umulh x16, x5, x6; \ - adds x14, x14, x16; \ - umulh x16, x5, x7; \ - adcs x15, x15, x16; \ - umulh x16, x6, x7; \ - adc x16, x16, xzr; \ - adds x1, x1, x1; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x5, xzr, xzr; \ - adds x12, x12, x1; \ - adcs x13, x13, x14; \ - adcs x17, x17, x15; \ - adcs x8, x8, x16; \ - adcs x9, x9, x5; \ - adc x10, x10, xzr; \ - mov x1, #-4294967295; \ - mov x14, #4294967295; \ - mov x15, #1; \ - cmn x11, x1; \ - adcs xzr, x12, x14; \ - adcs xzr, x13, x15; \ - adcs xzr, x17, xzr; \ - adcs xzr, x8, xzr; \ - adcs xzr, x9, xzr; \ - adc x10, x10, xzr; \ - neg x10, x10; \ - and x1, x1, x10; \ - adds x11, x11, x1; \ - and x14, x14, x10; \ - adcs x12, x12, x14; \ - and x15, x15, x10; \ - adcs x13, x13, x15; \ - adcs x17, x17, xzr; \ - adcs x8, x8, xzr; \ - adc x9, x9, xzr; \ - stp x11, x12, [P0]; \ - stp x13, x17, [P0+16]; \ - stp x8, x9, [P0+32] - -// Corresponds exactly to bignum_sub_p384 - -#define sub_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - csetm x3, lo; \ - mov x4, #4294967295; \ - and x4, x4, x3; \ - adds x5, x5, x4; \ - eor x4, x4, x3; \ - adcs x6, x6, x4; \ - mov x4, #-2; \ - and x4, x4, x3; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - adcs x9, x9, x3; \ - adc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -S2N_BN_SYMBOL(p384_montjmixadd): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! 
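The sub_p384 macro above (an inline copy of bignum_sub_p384) subtracts with borrow and then conditionally adds p_384 back under a mask, so for reduced inputs the result stays in [0, p_384) without a data-dependent branch. A standalone functional model in C, for illustration only (the function name and test values here are mine, not from the tree):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Model of sub_p384 / bignum_sub_p384: r = (a - b) mod p_384. */
    static void sub_p384_model(uint64_t r[6], const uint64_t a[6],
                               const uint64_t b[6]) {
      static const uint64_t p[6] = {
          0x00000000ffffffffULL, 0xffffffff00000000ULL, 0xfffffffffffffffeULL,
          0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL};
      uint64_t borrow = 0;
      for (int i = 0; i < 6; i++) {      /* subs/sbcs chain                    */
        uint64_t d = a[i] - b[i];
        uint64_t b1 = a[i] < b[i];
        r[i] = d - borrow;
        borrow = b1 | (d < borrow);
      }
      uint64_t mask = 0 - borrow;        /* csetm: all-ones iff we went negative */
      uint64_t carry = 0;
      for (int i = 0; i < 6; i++) {      /* conditionally add p_384 back       */
        uint64_t t = r[i] + (p[i] & mask);
        uint64_t c1 = t < r[i];
        r[i] = t + carry;
        carry = c1 | (r[i] < t);
      }
    }

    int main(void) {
      uint64_t one[6] = {1, 0, 0, 0, 0, 0}, two[6] = {2, 0, 0, 0, 0, 0}, r[6];
      sub_p384_model(r, one, two);       /* 1 - 2 == p_384 - 1 (mod p_384)     */
      assert(r[0] == 0x00000000fffffffeULL && r[5] == 0xffffffffffffffffULL);
      printf("sub_p384 model OK\n");
      return 0;
    }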
- sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - mov input_y, x2 - -// Main code, just a sequence of basic field operations -// 8 * multiply + 3 * square + 7 * subtract - - montsqr_p384(zp2,z_1) - montmul_p384(y2a,z_1,y_2) - - montmul_p384(x2a,zp2,x_2) - montmul_p384(y2a,zp2,y2a) - - sub_p384(xd,x2a,x_1) - sub_p384(yd,y2a,y_1) - - montsqr_p384(zz,xd) - montsqr_p384(ww,yd) - - montmul_p384(zzx1,zz,x_1) - montmul_p384(zzx2,zz,x2a) - - sub_p384(resx,ww,zzx1) - sub_p384(t1,zzx2,zzx1) - - montmul_p384(resz,xd,z_1) - - sub_p384(resx,resx,zzx2) - - sub_p384(t2,zzx1,resx) - - montmul_p384(t1,t1,y_1) - montmul_p384(t2,yd,t2) - - sub_p384(resy,t2,t1) - -// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) - - ldp x0, x1, [z_1] - ldp x2, x3, [z_1+16] - ldp x4, x5, [z_1+32] - orr x6, x0, x1 - orr x7, x2, x3 - orr x8, x4, x5 - orr x6, x6, x7 - orr x6, x6, x8 - cmp x6, xzr - -// Multiplex: if p1 <> 0 just copy the computed result from the staging area. -// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in -// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), -// hence giving 0 + p2 = p2 for the final result. - - ldp x0, x1, [resx] - ldp x19, x20, [x_2] - csel x0, x0, x19, ne - csel x1, x1, x20, ne - ldp x2, x3, [resx+16] - ldp x19, x20, [x_2+16] - csel x2, x2, x19, ne - csel x3, x3, x20, ne - ldp x4, x5, [resx+32] - ldp x19, x20, [x_2+32] - csel x4, x4, x19, ne - csel x5, x5, x20, ne - - ldp x6, x7, [resy] - ldp x19, x20, [y_2] - csel x6, x6, x19, ne - csel x7, x7, x20, ne - ldp x8, x9, [resy+16] - ldp x19, x20, [y_2+16] - csel x8, x8, x19, ne - csel x9, x9, x20, ne - ldp x10, x11, [resy+32] - ldp x19, x20, [y_2+32] - csel x10, x10, x19, ne - csel x11, x11, x20, ne - - ldp x12, x13, [resz] - mov x19, #0xffffffff00000001 - mov x20, #0x00000000ffffffff - csel x12, x12, x19, ne - csel x13, x13, x20, ne - ldp x14, x15, [resz+16] - mov x19, #1 - csel x14, x14, x19, ne - csel x15, x15, xzr, ne - ldp x16, x17, [resz+32] - csel x16, x16, xzr, ne - csel x17, x17, xzr, ne - - stp x0, x1, [x_3] - stp x2, x3, [x_3+16] - stp x4, x5, [x_3+32] - stp x6, x7, [y_3] - stp x8, x9, [y_3+16] - stp x10, x11, [y_3+32] - stp x12, x13, [z_3] - stp x14, x15, [z_3+16] - stp x16, x17, [z_3+32] - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjmixadd_alt.S b/third_party/s2n-bignum/arm/p384/p384_montjmixadd_alt.S deleted file mode 100644 index f36301a11ed..00000000000 --- a/third_party/s2n-bignum/arm/p384/p384_montjmixadd_alt.S +++ /dev/null @@ -1,941 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates -// -// extern void p384_montjmixadd_alt -// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); -// -// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with -// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. -// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
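At the level of field operations, the p384_montjmixadd sequence just listed reads as the following C sketch. The fe type and fe_* names are hypothetical stand-ins for the montsqr_p384 / montmul_p384 / sub_p384 macros, and the z_1 = 0 multiplexing at the end of the routine is omitted; this is an illustration, not code from the tree:

    #include <stdint.h>

    typedef uint64_t fe[6];  /* P-384 field element, Montgomery-encoded */

    void fe_montsqr(fe r, const fe a);              /* montsqr_p384 */
    void fe_montmul(fe r, const fe a, const fe b);  /* montmul_p384 */
    void fe_sub(fe r, const fe a, const fe b);      /* sub_p384     */

    /* Mixed addition core: (x1,y1,z1) + (x2,y2,1) -> (x3,y3,z3). */
    void p384_jmixadd_sketch(fe x3, fe y3, fe z3,
                             const fe x1, const fe y1, const fe z1,
                             const fe x2, const fe y2) {
      fe zp2, y2a, x2a, xd, yd, zz, ww, zzx1, zzx2, t1, t2;
      fe_montsqr(zp2, z1);            /* zp2  = z1^2              */
      fe_montmul(y2a, z1, y2);        /* y2a  = z1 * y2           */
      fe_montmul(x2a, zp2, x2);       /* x2a  = z1^2 * x2         */
      fe_montmul(y2a, zp2, y2a);      /* y2a  = z1^3 * y2         */
      fe_sub(xd, x2a, x1);            /* xd   = x2a - x1          */
      fe_sub(yd, y2a, y1);            /* yd   = y2a - y1          */
      fe_montsqr(zz, xd);             /* zz   = xd^2              */
      fe_montsqr(ww, yd);             /* ww   = yd^2              */
      fe_montmul(zzx1, zz, x1);       /* zzx1 = zz * x1           */
      fe_montmul(zzx2, zz, x2a);      /* zzx2 = zz * x2a          */
      fe_sub(x3, ww, zzx1);           /* x3   = yd^2 - zz*x1      */
      fe_sub(t1, zzx2, zzx1);         /* t1   = zz*(x2a - x1)     */
      fe_montmul(z3, xd, z1);         /* z3   = xd * z1           */
      fe_sub(x3, x3, zzx2);           /* x3  -= zz*x2a            */
      fe_sub(t2, zzx1, x3);           /* t2   = zz*x1 - x3        */
      fe_montmul(t1, t1, y1);         /* t1   = y1*zz*(x2a - x1)  */
      fe_montmul(t2, yd, t2);         /* t2   = yd*(zz*x1 - x3)   */
      fe_sub(y3, t2, t1);             /* y3   = t2 - t1           */
    }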
-// The "mixed" part means that p2 only has x and y coordinates, with the -// implicit z coordinate assumed to be the identity. -// -// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd_alt) - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 48 - -// Stable homes for input arguments during main code sequence - -#define input_z x24 -#define input_x x25 -#define input_y x26 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_2 input_y, #0 -#define y_2 input_y, #NUMSIZE - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define zp2 sp, #(NUMSIZE*0) -#define ww sp, #(NUMSIZE*0) -#define resx sp, #(NUMSIZE*0) - -#define yd sp, #(NUMSIZE*1) -#define y2a sp, #(NUMSIZE*1) - -#define x2a sp, #(NUMSIZE*2) -#define zzx2 sp, #(NUMSIZE*2) - -#define zz sp, #(NUMSIZE*3) -#define t1 sp, #(NUMSIZE*3) - -#define t2 sp, #(NUMSIZE*4) -#define zzx1 sp, #(NUMSIZE*4) -#define resy sp, #(NUMSIZE*4) - -#define xd sp, #(NUMSIZE*5) -#define resz sp, #(NUMSIZE*5) - -#define NSPACE (NUMSIZE*6) - -// Corresponds exactly to bignum_montmul_p384_alt - -#define montmul_p384(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x12, x3, x5; \ - umulh x13, x3, x5; \ - mul x11, x3, x6; \ - umulh x14, x3, x6; \ - adds x13, x13, x11; \ - ldp x7, x8, [P2+16]; \ - mul x11, x3, x7; \ - umulh x15, x3, x7; \ - adcs x14, x14, x11; \ - mul x11, x3, x8; \ - umulh x16, x3, x8; \ - adcs x15, x15, x11; \ - ldp x9, x10, [P2+32]; \ - mul x11, x3, x9; \ - umulh x17, x3, x9; \ - adcs x16, x16, x11; \ - mul x11, x3, x10; \ - umulh x19, x3, x10; \ - adcs x17, x17, x11; \ - adc x19, x19, xzr; \ - mul x11, x4, x5; \ - adds x13, x13, x11; \ - mul x11, x4, x6; \ - adcs x14, x14, x11; \ - mul x11, x4, x7; \ - adcs x15, x15, x11; \ - mul x11, x4, x8; \ - adcs x16, x16, x11; \ - mul x11, x4, x9; \ - adcs x17, x17, x11; \ - mul x11, x4, x10; \ - adcs x19, x19, x11; \ - cset x20, cs; \ - umulh x11, x4, x5; \ - adds x14, x14, x11; \ - umulh x11, x4, x6; \ - adcs x15, x15, x11; \ - umulh x11, x4, x7; \ - adcs x16, x16, x11; \ - umulh x11, x4, x8; \ - adcs x17, x17, x11; \ - umulh x11, x4, x9; \ - adcs x19, x19, x11; \ - umulh x11, x4, x10; \ - adc x20, x20, x11; \ - ldp x3, x4, [P1+16]; \ - mul x11, x3, x5; \ - adds x14, x14, x11; \ - mul x11, x3, x6; \ - adcs x15, x15, x11; \ - mul x11, x3, x7; \ - adcs x16, x16, x11; \ - mul x11, x3, x8; \ - adcs x17, x17, x11; \ - mul x11, x3, x9; \ - adcs x19, x19, x11; \ - mul x11, x3, x10; \ - adcs x20, x20, x11; \ - cset x21, cs; \ - umulh x11, x3, x5; \ - adds x15, x15, x11; \ - umulh x11, x3, x6; \ - adcs x16, x16, x11; \ - umulh x11, x3, x7; \ - adcs x17, x17, x11; \ - umulh x11, x3, x8; \ - adcs x19, x19, x11; \ - umulh x11, x3, x9; \ - adcs x20, x20, x11; \ - umulh x11, x3, x10; \ - adc x21, x21, x11; \ - mul x11, x4, x5; \ - adds x15, x15, x11; \ - mul x11, x4, x6; \ - adcs x16, x16, x11; \ - mul x11, x4, x7; \ - adcs x17, x17, x11; \ - mul x11, x4, x8; \ - adcs x19, x19, x11; \ - mul x11, x4, x9; \ - adcs x20, x20, x11; \ - mul x11, x4, x10; \ - adcs x21, x21, x11; \ - cset x22, cs; \ - umulh 
x11, x4, x5; \ - adds x16, x16, x11; \ - umulh x11, x4, x6; \ - adcs x17, x17, x11; \ - umulh x11, x4, x7; \ - adcs x19, x19, x11; \ - umulh x11, x4, x8; \ - adcs x20, x20, x11; \ - umulh x11, x4, x9; \ - adcs x21, x21, x11; \ - umulh x11, x4, x10; \ - adc x22, x22, x11; \ - ldp x3, x4, [P1+32]; \ - mul x11, x3, x5; \ - adds x16, x16, x11; \ - mul x11, x3, x6; \ - adcs x17, x17, x11; \ - mul x11, x3, x7; \ - adcs x19, x19, x11; \ - mul x11, x3, x8; \ - adcs x20, x20, x11; \ - mul x11, x3, x9; \ - adcs x21, x21, x11; \ - mul x11, x3, x10; \ - adcs x22, x22, x11; \ - cset x2, cs; \ - umulh x11, x3, x5; \ - adds x17, x17, x11; \ - umulh x11, x3, x6; \ - adcs x19, x19, x11; \ - umulh x11, x3, x7; \ - adcs x20, x20, x11; \ - umulh x11, x3, x8; \ - adcs x21, x21, x11; \ - umulh x11, x3, x9; \ - adcs x22, x22, x11; \ - umulh x11, x3, x10; \ - adc x2, x2, x11; \ - mul x11, x4, x5; \ - adds x17, x17, x11; \ - mul x11, x4, x6; \ - adcs x19, x19, x11; \ - mul x11, x4, x7; \ - adcs x20, x20, x11; \ - mul x11, x4, x8; \ - adcs x21, x21, x11; \ - mul x11, x4, x9; \ - adcs x22, x22, x11; \ - mul x11, x4, x10; \ - adcs x2, x2, x11; \ - cset x1, cs; \ - umulh x11, x4, x5; \ - adds x19, x19, x11; \ - umulh x11, x4, x6; \ - adcs x20, x20, x11; \ - umulh x11, x4, x7; \ - adcs x21, x21, x11; \ - umulh x11, x4, x8; \ - adcs x22, x22, x11; \ - umulh x11, x4, x9; \ - adcs x2, x2, x11; \ - umulh x11, x4, x10; \ - adc x1, x1, x11; \ - lsl x7, x12, #32; \ - add x12, x7, x12; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x12; \ - mov x6, #0xffffffff; \ - mul x5, x6, x12; \ - umulh x6, x6, x12; \ - adds x7, x7, x5; \ - adcs x6, x6, x12; \ - adc x5, xzr, xzr; \ - subs x13, x13, x7; \ - sbcs x14, x14, x6; \ - sbcs x15, x15, x5; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x12, x12, xzr; \ - lsl x7, x13, #32; \ - add x13, x7, x13; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x13; \ - mov x6, #0xffffffff; \ - mul x5, x6, x13; \ - umulh x6, x6, x13; \ - adds x7, x7, x5; \ - adcs x6, x6, x13; \ - adc x5, xzr, xzr; \ - subs x14, x14, x7; \ - sbcs x15, x15, x6; \ - sbcs x16, x16, x5; \ - sbcs x17, x17, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - lsl x7, x14, #32; \ - add x14, x7, x14; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x14; \ - mov x6, #0xffffffff; \ - mul x5, x6, x14; \ - umulh x6, x6, x14; \ - adds x7, x7, x5; \ - adcs x6, x6, x14; \ - adc x5, xzr, xzr; \ - subs x15, x15, x7; \ - sbcs x16, x16, x6; \ - sbcs x17, x17, x5; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x14, x14, xzr; \ - lsl x7, x15, #32; \ - add x15, x7, x15; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x15; \ - mov x6, #0xffffffff; \ - mul x5, x6, x15; \ - umulh x6, x6, x15; \ - adds x7, x7, x5; \ - adcs x6, x6, x15; \ - adc x5, xzr, xzr; \ - subs x16, x16, x7; \ - sbcs x17, x17, x6; \ - sbcs x12, x12, x5; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - lsl x7, x16, #32; \ - add x16, x7, x16; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x16; \ - mov x6, #0xffffffff; \ - mul x5, x6, x16; \ - umulh x6, x6, x16; \ - adds x7, x7, x5; \ - adcs x6, x6, x16; \ - adc x5, xzr, xzr; \ - subs x17, x17, x7; \ - sbcs x12, x12, x6; \ - sbcs x13, x13, x5; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbc x16, x16, xzr; \ - lsl x7, x17, #32; \ - add x17, x7, x17; \ - mov x7, #0xffffffff00000001; \ - umulh x7, x7, x17; \ - mov x6, #0xffffffff; \ - mul x5, x6, x17; \ - umulh x6, x6, x17; \ - adds x7, x7, x5; \ - adcs x6, x6, x17; \ - adc x5, xzr, xzr; \ - subs x12, x12, 
x7; \ - sbcs x13, x13, x6; \ - sbcs x14, x14, x5; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbc x17, x17, xzr; \ - adds x12, x12, x19; \ - adcs x13, x13, x20; \ - adcs x14, x14, x21; \ - adcs x15, x15, x22; \ - adcs x16, x16, x2; \ - adcs x17, x17, x1; \ - adc x10, xzr, xzr; \ - mov x11, #0xffffffff00000001; \ - adds x19, x12, x11; \ - mov x11, #0xffffffff; \ - adcs x20, x13, x11; \ - mov x11, #0x1; \ - adcs x21, x14, x11; \ - adcs x22, x15, xzr; \ - adcs x2, x16, xzr; \ - adcs x1, x17, xzr; \ - adcs x10, x10, xzr; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - csel x14, x14, x21, eq; \ - csel x15, x15, x22, eq; \ - csel x16, x16, x2, eq; \ - csel x17, x17, x1, eq; \ - stp x12, x13, [P0]; \ - stp x14, x15, [P0+16]; \ - stp x16, x17, [P0+32] - -// Corresponds exactly to bignum_montsqr_p384_alt - -#define montsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - 
sbcs x12, x12, x3; \ - sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ - sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, #32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - adc x6, xzr, xzr; \ - mov x8, #-4294967295; \ - adds x14, x2, x8; \ - mov x8, #4294967295; \ - adcs x15, x9, x8; \ - mov x8, #1; \ - adcs x16, x10, x8; \ - adcs x17, x11, xzr; \ - adcs x19, x12, xzr; \ - adcs x20, x13, xzr; \ - adcs x6, x6, xzr; \ - csel x2, x2, x14, eq; \ - csel x9, x9, x15, eq; \ - csel x10, x10, x16, eq; \ - csel x11, x11, x17, eq; \ - csel x12, x12, x19, eq; \ - csel x13, x13, x20, eq; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Almost-Montgomery variant which we use when an input to other muls -// with the other argument fully reduced (which is always safe). In -// fact, with the Karatsuba-based Montgomery mul here, we don't even -// *need* the restriction that the other argument is reduced. 
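Concretely (restating the convention, not quoting the file), both squaring macros return a six-limb value z with

    z = x^2 * 2^(-384) (mod p_384),   0 <= z < p_384    (montsqr_p384, for reduced x)
    z = x^2 * 2^(-384) (mod p_384),   0 <= z < 2^384    (amontsqr_p384)

montsqr_p384 finishes by trial-adding 2^384 - p_384 and selecting, which amounts to a compare against p_384 plus a conditional subtraction, so its output is fully reduced. The amontsqr_p384 variant defined just below looks only at the final carry c, adds c * (2^384 - p_384) and drops c * 2^384, i.e. it subtracts p_384 exactly when the six-limb sum overflowed. Its output therefore always fits in six limbs but may still be at or above p_384, which is the "almost-Montgomery" property the comment above refers to, and which is harmless when the value only feeds further Montgomery multiplications.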
- -#define amontsqr_p384(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x8, x2, x4; \ - adds x10, x10, x8; \ - mul x11, x2, x5; \ - mul x8, x3, x4; \ - adcs x11, x11, x8; \ - umulh x12, x2, x5; \ - mul x8, x3, x5; \ - adcs x12, x12, x8; \ - ldp x6, x7, [P1+32]; \ - mul x13, x2, x7; \ - mul x8, x3, x6; \ - adcs x13, x13, x8; \ - umulh x14, x2, x7; \ - mul x8, x3, x7; \ - adcs x14, x14, x8; \ - mul x15, x5, x6; \ - adcs x15, x15, xzr; \ - umulh x16, x5, x6; \ - adc x16, x16, xzr; \ - umulh x8, x2, x4; \ - adds x11, x11, x8; \ - umulh x8, x3, x4; \ - adcs x12, x12, x8; \ - umulh x8, x3, x5; \ - adcs x13, x13, x8; \ - umulh x8, x3, x6; \ - adcs x14, x14, x8; \ - umulh x8, x3, x7; \ - adcs x15, x15, x8; \ - adc x16, x16, xzr; \ - mul x8, x2, x6; \ - adds x12, x12, x8; \ - mul x8, x4, x5; \ - adcs x13, x13, x8; \ - mul x8, x4, x6; \ - adcs x14, x14, x8; \ - mul x8, x4, x7; \ - adcs x15, x15, x8; \ - mul x8, x5, x7; \ - adcs x16, x16, x8; \ - mul x17, x6, x7; \ - adcs x17, x17, xzr; \ - umulh x19, x6, x7; \ - adc x19, x19, xzr; \ - umulh x8, x2, x6; \ - adds x13, x13, x8; \ - umulh x8, x4, x5; \ - adcs x14, x14, x8; \ - umulh x8, x4, x6; \ - adcs x15, x15, x8; \ - umulh x8, x4, x7; \ - adcs x16, x16, x8; \ - umulh x8, x5, x7; \ - adcs x17, x17, x8; \ - adc x19, x19, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - cset x20, hs; \ - umulh x8, x2, x2; \ - mul x2, x2, x2; \ - adds x9, x9, x8; \ - mul x8, x3, x3; \ - adcs x10, x10, x8; \ - umulh x8, x3, x3; \ - adcs x11, x11, x8; \ - mul x8, x4, x4; \ - adcs x12, x12, x8; \ - umulh x8, x4, x4; \ - adcs x13, x13, x8; \ - mul x8, x5, x5; \ - adcs x14, x14, x8; \ - umulh x8, x5, x5; \ - adcs x15, x15, x8; \ - mul x8, x6, x6; \ - adcs x16, x16, x8; \ - umulh x8, x6, x6; \ - adcs x17, x17, x8; \ - mul x8, x7, x7; \ - adcs x19, x19, x8; \ - umulh x8, x7, x7; \ - adc x20, x20, x8; \ - lsl x5, x2, #32; \ - add x2, x5, x2; \ - mov x5, #-4294967295; \ - umulh x5, x5, x2; \ - mov x4, #4294967295; \ - mul x3, x4, x2; \ - umulh x4, x4, x2; \ - adds x5, x5, x3; \ - adcs x4, x4, x2; \ - adc x3, xzr, xzr; \ - subs x9, x9, x5; \ - sbcs x10, x10, x4; \ - sbcs x11, x11, x3; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbc x2, x2, xzr; \ - lsl x5, x9, #32; \ - add x9, x5, x9; \ - mov x5, #-4294967295; \ - umulh x5, x5, x9; \ - mov x4, #4294967295; \ - mul x3, x4, x9; \ - umulh x4, x4, x9; \ - adds x5, x5, x3; \ - adcs x4, x4, x9; \ - adc x3, xzr, xzr; \ - subs x10, x10, x5; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - sbcs x13, x13, xzr; \ - sbcs x2, x2, xzr; \ - sbc x9, x9, xzr; \ - lsl x5, x10, #32; \ - add x10, x5, x10; \ - mov x5, #-4294967295; \ - umulh x5, x5, x10; \ - mov x4, #4294967295; \ - mul x3, x4, x10; \ - umulh x4, x4, x10; \ - adds x5, x5, x3; \ - adcs x4, x4, x10; \ - adc x3, xzr, xzr; \ - subs x11, x11, x5; \ - sbcs x12, x12, x4; \ - sbcs x13, x13, x3; \ - sbcs x2, x2, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - lsl x5, x11, #32; \ - add x11, x5, x11; \ - mov x5, #-4294967295; \ - umulh x5, x5, x11; \ - mov x4, #4294967295; \ - mul x3, x4, x11; \ - umulh x4, x4, x11; \ - adds x5, x5, x3; \ - adcs x4, x4, x11; \ - adc x3, xzr, xzr; \ - subs x12, x12, x5; \ - sbcs x13, x13, x4; \ - sbcs x2, x2, x3; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbc x11, x11, xzr; \ - lsl x5, x12, 
#32; \ - add x12, x5, x12; \ - mov x5, #-4294967295; \ - umulh x5, x5, x12; \ - mov x4, #4294967295; \ - mul x3, x4, x12; \ - umulh x4, x4, x12; \ - adds x5, x5, x3; \ - adcs x4, x4, x12; \ - adc x3, xzr, xzr; \ - subs x13, x13, x5; \ - sbcs x2, x2, x4; \ - sbcs x9, x9, x3; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbc x12, x12, xzr; \ - lsl x5, x13, #32; \ - add x13, x5, x13; \ - mov x5, #-4294967295; \ - umulh x5, x5, x13; \ - mov x4, #4294967295; \ - mul x3, x4, x13; \ - umulh x4, x4, x13; \ - adds x5, x5, x3; \ - adcs x4, x4, x13; \ - adc x3, xzr, xzr; \ - subs x2, x2, x5; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, x17; \ - adcs x12, x12, x19; \ - adcs x13, x13, x20; \ - mov x14, #-4294967295; \ - mov x15, #4294967295; \ - csel x14, x14, xzr, cs; \ - csel x15, x15, xzr, cs; \ - cset x16, cs; \ - adds x2, x2, x14; \ - adcs x9, x9, x15; \ - adcs x10, x10, x16; \ - adcs x11, x11, xzr; \ - adcs x12, x12, xzr; \ - adc x13, x13, xzr; \ - stp x2, x9, [P0]; \ - stp x10, x11, [P0+16]; \ - stp x12, x13, [P0+32] - -// Corresponds exactly to bignum_sub_p384 - -#define sub_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - csetm x3, lo; \ - mov x4, #4294967295; \ - and x4, x4, x3; \ - adds x5, x5, x4; \ - eor x4, x4, x3; \ - adcs x6, x6, x4; \ - mov x4, #-2; \ - and x4, x4, x3; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - adcs x9, x9, x3; \ - adc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32] - -S2N_BN_SYMBOL(p384_montjmixadd_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! - sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - mov input_y, x2 - -// Main code, just a sequence of basic field operations -// 8 * multiply + 3 * square + 7 * subtract - - amontsqr_p384(zp2,z_1) - montmul_p384(y2a,z_1,y_2) - - montmul_p384(x2a,zp2,x_2) - montmul_p384(y2a,zp2,y2a) - - sub_p384(xd,x2a,x_1) - sub_p384(yd,y2a,y_1) - - amontsqr_p384(zz,xd) - montsqr_p384(ww,yd) - - montmul_p384(zzx1,zz,x_1) - montmul_p384(zzx2,zz,x2a) - - sub_p384(resx,ww,zzx1) - sub_p384(t1,zzx2,zzx1) - - montmul_p384(resz,xd,z_1) - - sub_p384(resx,resx,zzx2) - - sub_p384(t2,zzx1,resx) - - montmul_p384(t1,t1,y_1) - montmul_p384(t2,yd,t2) - - sub_p384(resy,t2,t1) - -// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) - - ldp x0, x1, [z_1] - ldp x2, x3, [z_1+16] - ldp x4, x5, [z_1+32] - orr x6, x0, x1 - orr x7, x2, x3 - orr x8, x4, x5 - orr x6, x6, x7 - orr x6, x6, x8 - cmp x6, xzr - -// Multiplex: if p1 <> 0 just copy the computed result from the staging area. -// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in -// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), -// hence giving 0 + p2 = p2 for the final result. 
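As a quick check of the claim above: the z coordinate stored for the p1 = 0 case (limbs 0xffffffff00000001, 0x00000000ffffffff, 1, 0, 0, 0) is indeed 2^384 - p_384, i.e. 1 in Montgomery form, since adding p_384 to it gives exactly 2^384. A standalone verification, illustrative only and not from the tree:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      static const uint64_t p[6] = {      /* limbs of p_384 */
          0x00000000ffffffffULL, 0xffffffff00000000ULL, 0xfffffffffffffffeULL,
          0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL};
      static const uint64_t mont_one[6] = {  /* constants stored for z_3 above */
          0xffffffff00000001ULL, 0x00000000ffffffffULL, 1, 0, 0, 0};

      /* p_384 + (2^384 - p_384) must come out to exactly 2^384:
       * every limb of the sum is zero and the final carry is 1. */
      uint64_t carry = 0;
      for (int i = 0; i < 6; i++) {
        uint64_t t = p[i] + mont_one[i];
        uint64_t c1 = t < p[i];
        uint64_t s = t + carry;
        carry = c1 | (s < t);
        assert(s == 0);
      }
      assert(carry == 1);
      printf("z = 1 constant is 2^384 - p_384\n");
      return 0;
    }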
- - ldp x0, x1, [resx] - ldp x19, x20, [x_2] - csel x0, x0, x19, ne - csel x1, x1, x20, ne - ldp x2, x3, [resx+16] - ldp x19, x20, [x_2+16] - csel x2, x2, x19, ne - csel x3, x3, x20, ne - ldp x4, x5, [resx+32] - ldp x19, x20, [x_2+32] - csel x4, x4, x19, ne - csel x5, x5, x20, ne - - ldp x6, x7, [resy] - ldp x19, x20, [y_2] - csel x6, x6, x19, ne - csel x7, x7, x20, ne - ldp x8, x9, [resy+16] - ldp x19, x20, [y_2+16] - csel x8, x8, x19, ne - csel x9, x9, x20, ne - ldp x10, x11, [resy+32] - ldp x19, x20, [y_2+32] - csel x10, x10, x19, ne - csel x11, x11, x20, ne - - ldp x12, x13, [resz] - mov x19, #0xffffffff00000001 - mov x20, #0x00000000ffffffff - csel x12, x12, x19, ne - csel x13, x13, x20, ne - ldp x14, x15, [resz+16] - mov x19, #1 - csel x14, x14, x19, ne - csel x15, x15, xzr, ne - ldp x16, x17, [resz+32] - csel x16, x16, xzr, ne - csel x17, x17, xzr, ne - - stp x0, x1, [x_3] - stp x2, x3, [x_3+16] - stp x4, x5, [x_3+32] - stp x6, x7, [y_3] - stp x8, x9, [y_3+16] - stp x10, x11, [y_3+32] - stp x12, x13, [z_3] - stp x14, x15, [z_3+16] - stp x16, x17, [z_3+32] - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjscalarmul.S b/third_party/s2n-bignum/arm/p384/p384_montjscalarmul.S deleted file mode 100644 index 2bd405e2454..00000000000 --- a/third_party/s2n-bignum/arm/p384/p384_montjscalarmul.S +++ /dev/null @@ -1,9988 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Montgomery-Jacobian form scalar multiplication for P-384 -// Input scalar[6], point[18]; output res[18] -// -// extern void p384_montjscalarmul -// (uint64_t res[static 18], -// uint64_t scalar[static 6], -// uint64_t point[static 18]); -// -// This function is a variant of its affine point version p384_scalarmul. -// Here, input and output points are assumed to be in Jacobian form with -// their coordinates in the Montgomery domain. Thus, if priming indicates -// Montgomery form, x' = (2^384 * x) mod p_384 etc., each point argument -// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when -// z' is nonzero or the point at infinity (group identity) if z' = 0. -// -// Given scalar = n and point = P, assumed to be on the NIST elliptic -// curve P-384, returns a representation of n * P. If the result is the -// point at infinity (either because the input point was or because the -// scalar was a multiple of p_384) then the output is guaranteed to -// represent the point at infinity, i.e. to have its z coordinate zero. -// -// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point -// ---------------------------------------------------------------------------- - -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjscalarmul) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjscalarmul) - - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 48 -#define JACSIZE (3*NUMSIZE) - -// Safe copies of input res and additional values in variables. - -#define bf x22 -#define sgn x23 -#define j x24 -#define res x25 - -// Intermediate variables on the stack. 
-// The table is 16 entries, each of size JACSIZE = 3 * NUMSIZE - -#define scalarb sp, #(0*NUMSIZE) -#define acc sp, #(1*NUMSIZE) -#define tabent sp, #(4*NUMSIZE) - -#define tab sp, #(7*NUMSIZE) - -#define NSPACE #(55*NUMSIZE) - -// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, -// which doesn't accept repetitions, assembler macros etc. - -#define selectblock(I) \ - cmp bf, #(1*I); \ - ldp x20, x21, [x19]; \ - csel x0, x20, x0, eq; \ - csel x1, x21, x1, eq; \ - ldp x20, x21, [x19, #16]; \ - csel x2, x20, x2, eq; \ - csel x3, x21, x3, eq; \ - ldp x20, x21, [x19, #32]; \ - csel x4, x20, x4, eq; \ - csel x5, x21, x5, eq; \ - ldp x20, x21, [x19, #48]; \ - csel x6, x20, x6, eq; \ - csel x7, x21, x7, eq; \ - ldp x20, x21, [x19, #64]; \ - csel x8, x20, x8, eq; \ - csel x9, x21, x9, eq; \ - ldp x20, x21, [x19, #80]; \ - csel x10, x20, x10, eq; \ - csel x11, x21, x11, eq; \ - ldp x20, x21, [x19, #96]; \ - csel x12, x20, x12, eq; \ - csel x13, x21, x13, eq; \ - ldp x20, x21, [x19, #112]; \ - csel x14, x20, x14, eq; \ - csel x15, x21, x15, eq; \ - ldp x20, x21, [x19, #128]; \ - csel x16, x20, x16, eq; \ - csel x17, x21, x17, eq; \ - add x19, x19, #JACSIZE - -// Loading large constants - -#define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ - movk nn, n3, lsl #48 - -S2N_BN_SYMBOL(p384_montjscalarmul): - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x30, [sp, #-16]! - sub sp, sp, NSPACE - -// Preserve the "res" input argument; others get processed early. - - mov res, x0 - -// Reduce the input scalar mod n_384, i.e. conditionally subtract n_384. -// Store it to "scalarb". - - ldp x3, x4, [x1] - movbig(x15, #0xecec, #0x196a, #0xccc5, #0x2973) - ldp x5, x6, [x1, #16] - movbig(x16, #0x581a, #0x0db2, #0x48b0, #0xa77a) - ldp x7, x8, [x1, #32] - movbig(x17, #0xc763, #0x4d81, #0xf437, #0x2ddf) - - subs x9, x3, x15 - sbcs x10, x4, x16 - sbcs x11, x5, x17 - adcs x12, x6, xzr - adcs x13, x7, xzr - adcs x14, x8, xzr - - csel x3, x3, x9, cc - csel x4, x4, x10, cc - csel x5, x5, x11, cc - csel x6, x6, x12, cc - csel x7, x7, x13, cc - csel x8, x8, x14, cc - - stp x3, x4, [scalarb] - stp x5, x6, [scalarb+16] - stp x7, x8, [scalarb+32] - -// Set the tab[0] table entry to the input point = 1 * P - - ldp x10, x11, [x2] - stp x10, x11, [tab] - ldp x12, x13, [x2, #16] - stp x12, x13, [tab+16] - ldp x14, x15, [x2, #32] - stp x14, x15, [tab+32] - - ldp x10, x11, [x2, #48] - stp x10, x11, [tab+48] - ldp x12, x13, [x2, #64] - stp x12, x13, [tab+64] - ldp x14, x15, [x2, #80] - stp x14, x15, [tab+80] - - ldp x10, x11, [x2, #96] - stp x10, x11, [tab+96] - ldp x12, x13, [x2, #112] - stp x12, x13, [tab+112] - ldp x14, x15, [x2, #128] - stp x14, x15, [tab+128] - -// Compute and record tab[1] = 2 * p, ..., tab[15] = 16 * P - - add x0, tab+JACSIZE*1 - add x1, tab - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*2 - add x1, tab+JACSIZE*1 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*3 - add x1, tab+JACSIZE*1 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*4 - add x1, tab+JACSIZE*3 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*5 - add x1, tab+JACSIZE*2 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*6 - add x1, tab+JACSIZE*5 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*7 - add x1, tab+JACSIZE*3 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*8 - add x1, 
tab+JACSIZE*7 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*9 - add x1, tab+JACSIZE*4 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*10 - add x1, tab+JACSIZE*9 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*11 - add x1, tab+JACSIZE*5 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*12 - add x1, tab+JACSIZE*11 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*13 - add x1, tab+JACSIZE*6 - bl p384_montjscalarmul_p384_montjdouble - - add x0, tab+JACSIZE*14 - add x1, tab+JACSIZE*13 - add x2, tab - bl p384_montjscalarmul_p384_montjadd - - add x0, tab+JACSIZE*15 - add x1, tab+JACSIZE*7 - bl p384_montjscalarmul_p384_montjdouble - -// Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed -// digits. The digits of the constant, in lowest-to-highest order, are as -// follows; they are generated dynamically since none is a simple ARM load. -// -// 0x0842108421084210 -// 0x1084210842108421 -// 0x2108421084210842 -// 0x4210842108421084 -// 0x8421084210842108 -// 0x0842108421084210 - - ldp x0, x1, [scalarb] - ldp x2, x3, [scalarb+16] - ldp x4, x5, [scalarb+32] - movbig(x8, #0x1084, #0x2108, #0x4210, #0x8421) - adds x0, x0, x8, lsr #1 - adcs x1, x1, x8 - lsl x8, x8, #1 - adcs x2, x2, x8 - lsl x8, x8, #1 - adcs x3, x3, x8 - lsl x8, x8, #1 - adcs x4, x4, x8 - lsr x8, x8, #4 - adcs x5, x5, x8 - cset x6, cs - -// Record the top bitfield then shift the whole scalar left 4 bits -// to align the top of the next bitfield with the MSB (bits 379..383). - - extr bf, x6, x5, #60 - extr x5, x5, x4, #60 - extr x4, x4, x3, #60 - extr x3, x3, x2, #60 - extr x2, x2, x1, #60 - extr x1, x1, x0, #60 - lsl x0, x0, #4 - stp x0, x1, [scalarb] - stp x2, x3, [scalarb+16] - stp x4, x5, [scalarb+32] - -// Initialize the accumulator to the corresponding entry using constant-time -// lookup in the table. This top digit, uniquely, is not recoded so there is -// no sign adjustment to make. - - mov x0, xzr - mov x1, xzr - mov x2, xzr - mov x3, xzr - mov x4, xzr - mov x5, xzr - mov x6, xzr - mov x7, xzr - mov x8, xzr - mov x9, xzr - mov x10, xzr - mov x11, xzr - mov x12, xzr - mov x13, xzr - mov x14, xzr - mov x15, xzr - mov x16, xzr - mov x17, xzr - - add x19, tab - - selectblock(1) - selectblock(2) - selectblock(3) - selectblock(4) - selectblock(5) - selectblock(6) - selectblock(7) - selectblock(8) - selectblock(9) - selectblock(10) - selectblock(11) - selectblock(12) - selectblock(13) - selectblock(14) - selectblock(15) - selectblock(16) - - stp x0, x1, [acc] - stp x2, x3, [acc+16] - stp x4, x5, [acc+32] - stp x6, x7, [acc+48] - stp x8, x9, [acc+64] - stp x10, x11, [acc+80] - stp x12, x13, [acc+96] - stp x14, x15, [acc+112] - stp x16, x17, [acc+128] - - mov j, #380 - -// Main loop over size-5 bitfields: double 5 times then add signed digit -// At each stage we shift the scalar left by 5 bits so we can simply pick -// the top 5 bits as the bitfield, saving some fiddle over indexing. 
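The recoding described above (add 16 to every 5-bit window, then treat each raw window value w as the signed digit w - 16, kept as sign plus magnitude) can be demonstrated on a scaled-down scalar. The sketch below uses 12 windows of a 60-bit value instead of the 384-bit case, and the names are mine; it only illustrates the identity that the signed digits plus the unsigned top bitfield reproduce the original scalar:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const uint64_t n = 0x0123456789abcdeULL;  /* any scalar below 2^60 */
      /* Recoding constant: 16 in every 5-bit window, i.e. sum_i 16 * 32^i. */
      uint64_t C = 0;
      for (int i = 0; i < 12; i++) C += 16ULL << (5 * i);

      uint64_t t = n + C;                  /* may carry into bit 60: top bitfield */
      int64_t acc = 0;
      for (int i = 0; i < 12; i++) {
        int64_t w = (int64_t)((t >> (5 * i)) & 31); /* raw base-32 digit of n + C */
        int64_t digit = w - 16;                     /* signed digit in [-16, 15]  */
        int sgn = digit < 0;                        /* like "cset sgn, lo"        */
        int64_t mag = sgn ? -digit : digit;         /* like "cneg bf, bf, lo"     */
        acc += (sgn ? -mag : mag) * ((int64_t)1 << (5 * i));
      }
      acc += (int64_t)(t >> 60) * ((int64_t)1 << 60); /* unsigned top bitfield    */
      assert(acc == (int64_t)n);
      printf("signed-digit recoding reproduces the scalar\n");
      return 0;
    }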
- -p384_montjscalarmul_mainloop: - sub j, j, #5 - - add x0, acc - add x1, acc - bl p384_montjscalarmul_p384_montjdouble - - add x0, acc - add x1, acc - bl p384_montjscalarmul_p384_montjdouble - - add x0, acc - add x1, acc - bl p384_montjscalarmul_p384_montjdouble - - add x0, acc - add x1, acc - bl p384_montjscalarmul_p384_montjdouble - - add x0, acc - add x1, acc - bl p384_montjscalarmul_p384_montjdouble - -// Choose the bitfield and adjust it to sign and magnitude - - ldp x0, x1, [scalarb] - ldp x2, x3, [scalarb+16] - ldp x4, x5, [scalarb+32] - lsr bf, x5, #59 - extr x5, x5, x4, #59 - extr x4, x4, x3, #59 - extr x3, x3, x2, #59 - extr x2, x2, x1, #59 - extr x1, x1, x0, #59 - lsl x0, x0, #5 - stp x0, x1, [scalarb] - stp x2, x3, [scalarb+16] - stp x4, x5, [scalarb+32] - - subs bf, bf, #16 - cset sgn, lo // sgn = sign of digit (1 = negative) - cneg bf, bf, lo // bf = absolute value of digit - -// Conditionally select the table entry tab[i-1] = i * P in constant time - - mov x0, xzr - mov x1, xzr - mov x2, xzr - mov x3, xzr - mov x4, xzr - mov x5, xzr - mov x6, xzr - mov x7, xzr - mov x8, xzr - mov x9, xzr - mov x10, xzr - mov x11, xzr - mov x12, xzr - mov x13, xzr - mov x14, xzr - mov x15, xzr - mov x16, xzr - mov x17, xzr - - add x19, tab - - selectblock(1) - selectblock(2) - selectblock(3) - selectblock(4) - selectblock(5) - selectblock(6) - selectblock(7) - selectblock(8) - selectblock(9) - selectblock(10) - selectblock(11) - selectblock(12) - selectblock(13) - selectblock(14) - selectblock(15) - selectblock(16) - -// Store it to "tabent" with the y coordinate optionally negated. -// This is done carefully to give coordinates < p_384 even in -// the degenerate case y = 0 (when z = 0 for points on the curve). - - stp x0, x1, [tabent] - stp x2, x3, [tabent+16] - stp x4, x5, [tabent+32] - - stp x12, x13, [tabent+96] - stp x14, x15, [tabent+112] - stp x16, x17, [tabent+128] - - mov x0, #0x00000000ffffffff - subs x0, x0, x6 - orr x12, x6, x7 - mov x1, #0xffffffff00000000 - sbcs x1, x1, x7 - orr x13, x8, x9 - mov x2, #0xfffffffffffffffe - sbcs x2, x2, x8 - orr x14, x10, x11 - mov x5, #0xffffffffffffffff - sbcs x3, x5, x9 - orr x12, x12, x13 - sbcs x4, x5, x10 - orr x12, x12, x14 - sbcs x5, x5, x11 - - cmp sgn, xzr - ccmp x12, xzr, #4, ne - - csel x6, x0, x6, ne - csel x7, x1, x7, ne - csel x8, x2, x8, ne - csel x9, x3, x9, ne - csel x10, x4, x10, ne - csel x11, x5, x11, ne - - stp x6, x7, [tabent+48] - stp x8, x9, [tabent+64] - stp x10, x11, [tabent+80] - -// Add to the accumulator - - add x0, acc - add x1, acc - add x2, tabent - bl p384_montjscalarmul_p384_montjadd - - cbnz j, p384_montjscalarmul_mainloop - -// That's the end of the main loop, and we just need to copy the -// result in "acc" to the output. - - ldp x0, x1, [acc] - stp x0, x1, [res] - ldp x0, x1, [acc+16] - stp x0, x1, [res, #16] - ldp x0, x1, [acc+32] - stp x0, x1, [res, #32] - ldp x0, x1, [acc+48] - stp x0, x1, [res, #48] - ldp x0, x1, [acc+64] - stp x0, x1, [res, #64] - ldp x0, x1, [acc+80] - stp x0, x1, [res, #80] - ldp x0, x1, [acc+96] - stp x0, x1, [res, #96] - ldp x0, x1, [acc+112] - stp x0, x1, [res, #112] - ldp x0, x1, [acc+128] - stp x0, x1, [res, #128] - -// Restore stack and registers and return - - add sp, sp, NSPACE - ldp x25, x30, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - ret - -// Local copies of subroutines, complete clones at the moment - -p384_montjscalarmul_p384_montjadd: - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! 
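The per-digit lookup in the main loop above selects |digit| * P in constant time and, for a negative digit, negates the point by replacing y with p_384 - y, except that y = 0 is kept as 0 so the stored coordinates stay below p_384. A functional (not constant-time) C model of that negation step, with hypothetical names and illustrative test values:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* If sgn != 0 and y != 0, replace y with p_384 - y; otherwise keep y. */
    static void cond_negate_y(uint64_t y[6], uint64_t sgn) {
      static const uint64_t p[6] = {
          0x00000000ffffffffULL, 0xffffffff00000000ULL, 0xfffffffffffffffeULL,
          0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL};
      uint64_t ny[6], borrow = 0, nonzero = 0;
      for (int i = 0; i < 6; i++) {
        nonzero |= y[i];                   /* orr chain testing y == 0        */
        uint64_t d = p[i] - y[i];          /* p_384 - y (no borrow if y < p)  */
        ny[i] = d - borrow;
        borrow = (p[i] < y[i]) | (d < borrow);
      }
      int negate = (sgn != 0) && (nonzero != 0);  /* like cmp sgn / ccmp / ne  */
      for (int i = 0; i < 6; i++)
        y[i] = negate ? ny[i] : y[i];             /* csel in the assembly      */
    }

    int main(void) {
      uint64_t y[6] = {5, 0, 0, 0, 0, 0};
      cond_negate_y(y, 1);
      assert(y[0] == 0x00000000fffffffaULL);      /* low limb of p_384 - 5     */
      uint64_t z[6] = {0, 0, 0, 0, 0, 0};
      cond_negate_y(z, 1);
      assert(z[0] == 0);                          /* y = 0 stays 0, not p_384  */
      printf("conditional negation OK\n");
      return 0;
    }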
- stp x25, x26, [sp, #-16]! - stp x27, xzr, [sp, #-16]! - sub sp, sp, #0x180 - mov x24, x0 - mov x25, x1 - mov x26, x2 - mov x0, sp - ldr q1, [x25, #96] - ldp x9, x2, [x25, #96] - ldr q0, [x25, #96] - ldp x4, x6, [x25, #112] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [x25, #128] - xtn v30.2s, v0.2d - ldr q1, [x25, #128] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [x25, #128] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [x0] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [x0, #16] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [x0, #32] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - 
cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [x0, #16] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [x0] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [x0, #32] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [x0] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [x0, #16] - adc x17, x14, xzr - stp x2, x17, [x0, #32] - ldr q1, [x26, #96] - ldp x9, x2, [x26, #96] - ldr q0, [x26, #96] - ldp x4, x6, [x26, #112] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [x26, #128] - xtn v30.2s, v0.2d - ldr q1, [x26, #128] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [x26, #128] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, 
v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #240] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [sp, #256] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #272] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [sp, #256] 
- umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [sp, #240] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #272] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [sp, #240] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #256] - adc x17, x14, xzr - stp x2, x17, [sp, #272] - stp x23, x24, [sp, #0x150] - ldr q3, [x26, #96] - ldr q25, [x25, #48] - ldp x13, x23, [x25, #48] - ldp x3, x21, [x26, #96] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x25, #80] - ldp x8, x24, [x26, #112] - subs x6, x3, x21 - ldr q0, [x26, #128] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x25, #64] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, 
last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x25, #80] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [x26, #128] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #288] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #304] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #320] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc 
// cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #288] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #304] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #320] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #288] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #304] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #320] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #288] - ldp x21, x12, [sp, #304] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #320] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - 
sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #288] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #304] - adc x12, x15, x23 - stp x21, x12, [sp, #320] - ldr q3, [x25, #96] - ldr q25, [x26, #48] - ldp x13, x23, [x26, #48] - ldp x3, x21, [x25, #96] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x26, #80] - ldp x8, x24, [x25, #112] - subs x6, x3, x21 - ldr q0, [x25, #128] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x26, #64] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x26, #80] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [x25, #128] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, 
#32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #48] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #64] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #80] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #48] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #64] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #80] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #48] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #64] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #80] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, 
cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #48] - ldp x21, x12, [sp, #64] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #80] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #48] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #64] - adc x12, x15, x23 - stp x21, x12, [sp, #80] - mov x1, sp - ldr q3, [x1] - ldr q25, [x26] - ldp x13, x23, [x26] - ldp x3, x21, [x1] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x26, #32] - ldp x8, x24, [x1, #16] - subs x6, x3, x21 - ldr q0, [x1, #32] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x26, #16] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, 
v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x26, #32] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [x1, #32] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #96] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #112] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #128] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, 
xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #96] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #112] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #128] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #96] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #112] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #128] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #96] - ldp x21, x12, [sp, #112] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #128] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc 
x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #96] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #112] - adc x12, x15, x23 - stp x21, x12, [sp, #128] - ldr q3, [sp, #240] - ldr q25, [x25] - ldp x13, x23, [x25] - ldp x3, x21, [sp, #240] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x25, #32] - ldp x8, x24, [sp, #256] - subs x6, x3, x21 - ldr q0, [sp, #272] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x25, #16] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x25, #32] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #272] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, 
x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #192] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #208] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #224] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #192] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #208] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #224] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #192] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #208] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #224] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - 
csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #192] - ldp x21, x12, [sp, #208] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #224] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #192] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #208] - adc x12, x15, x23 - stp x21, x12, [sp, #224] - mov x1, sp - ldr q3, [x1] - ldr q25, [sp, #48] - ldp x13, x23, [sp, #48] - ldp x3, x21, [x1] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #80] - ldp x8, x24, [x1, #16] - subs x6, x3, x21 - ldr q0, [x1, #32] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, 
x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #64] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #80] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [x1, #32] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #48] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #64] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #80] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = 
lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #48] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #64] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #80] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #48] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #64] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #80] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #48] - ldp x21, x12, [sp, #64] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #80] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - 
adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #48] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #64] - adc x12, x15, x23 - stp x21, x12, [sp, #80] - ldr q3, [sp, #240] - ldr q25, [sp, #288] - ldp x13, x23, [sp, #288] - ldp x3, x21, [sp, #240] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #320] - ldp x8, x24, [sp, #256] - subs x6, x3, x21 - ldr q0, [sp, #272] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #304] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #320] - 
add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #272] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #288] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #304] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #320] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #288] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #304] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #320] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #288] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #304] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #320] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = 
lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #288] - ldp x21, x12, [sp, #304] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #320] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x2, x24, x11 - stp x22, x5, [sp, #288] - adcs x11, x13, x23 - adcs x12, x8, x23 - stp x2, x11, [sp, #304] - adc x13, x15, x23 - stp x12, x13, [sp, #320] - ldp x5, x6, [sp, #96] - ldp x4, x3, [sp, #192] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [sp, #112] - ldp x4, x3, [sp, #208] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [sp, #128] - ldp x4, x3, [sp, #224] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds 
x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [sp, #240] - stp x7, x8, [sp, #256] - stp x9, x10, [sp, #272] - ldp x5, x6, [sp, #48] - ldp x4, x3, [sp, #288] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [sp, #64] - sbcs x7, x7, x2 - sbcs x8, x8, x11 - ldp x9, x10, [sp, #80] - sbcs x9, x9, x12 - sbcs x10, x10, x13 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [sp, #48] - stp x7, x8, [sp, #64] - stp x9, x10, [sp, #80] - ldr q1, [sp, #240] - ldp x9, x2, [sp, #240] - ldr q0, [sp, #240] - ldp x4, x6, [sp, #256] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [sp, #272] - xtn v30.2s, v0.2d - ldr q1, [sp, #272] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [sp, #272] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #144] - sbc x14, x7, xzr - mul x7, x4, 
x10 - subs x1, x9, x2 - stp x15, x3, [sp, #160] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #176] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [sp, #160] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [sp, #144] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #176] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and 
x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [sp, #144] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #160] - adc x17, x14, xzr - stp x2, x17, [sp, #176] - mov x0, sp - ldr q1, [sp, #48] - ldp x9, x2, [sp, #48] - ldr q0, [sp, #48] - ldp x4, x6, [sp, #64] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [sp, #80] - xtn v30.2s, v0.2d - ldr q1, [sp, #80] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [sp, #80] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [x0] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [x0, #16] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [x0, #32] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - 
subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [x0, #16] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [x0] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [x0, #32] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [x0] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [x0, #16] - adc x17, x14, xzr - stp x2, x17, [x0, #32] - ldr q3, [sp, #144] - ldr q25, [sp, #192] - ldp x13, x23, [sp, #192] - ldp x3, x21, [sp, #144] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #224] - ldp x8, x24, [sp, #160] - subs x6, x3, x21 - ldr q0, [sp, #176] - movi v23.2d, 
#0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #208] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #224] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #176] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #192] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #208] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #224] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, 
x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #192] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #208] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #224] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #192] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #208] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #224] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #192] - ldp x21, x12, [sp, #208] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #224] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs 
x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #192] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #208] - adc x12, x15, x23 - stp x21, x12, [sp, #224] - ldr q3, [sp, #144] - ldr q25, [sp, #96] - ldp x13, x23, [sp, #96] - ldp x3, x21, [sp, #144] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #128] - ldp x8, x24, [sp, #160] - subs x6, x3, x21 - ldr q0, [sp, #176] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #112] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 
- cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #128] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #176] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #96] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #112] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #128] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #96] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #112] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #128] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #96] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #112] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - 
umulh x2, x16, x6 - stp x3, x12, [sp, #128] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #96] - ldp x21, x12, [sp, #112] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #128] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x2, x24, x11 - stp x22, x5, [sp, #96] - adcs x11, x13, x23 - adcs x12, x8, x23 - stp x2, x11, [sp, #112] - adc x13, x15, x23 - stp x12, x13, [sp, #128] - mov x0, sp - mov x1, sp - ldp x5, x6, [x1] - ldp x4, x3, [sp, #192] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, 
x8, [x1, #16] - ldp x4, x3, [sp, #208] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [x1, #32] - ldp x4, x3, [sp, #224] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [x0] - stp x7, x8, [x0, #16] - stp x9, x10, [x0, #32] - ldp x5, x6, [sp, #96] - ldp x4, x3, [sp, #192] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x4, x3, [sp, #208] - sbcs x7, x2, x4 - sbcs x8, x11, x3 - ldp x4, x3, [sp, #224] - sbcs x9, x12, x4 - sbcs x10, x13, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [sp, #144] - stp x7, x8, [sp, #160] - stp x9, x10, [sp, #176] - ldr q3, [sp, #240] - ldr q25, [x25, #96] - ldp x13, x23, [x25, #96] - ldp x3, x21, [sp, #240] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x25, #128] - ldp x8, x24, [sp, #256] - subs x6, x3, x21 - ldr q0, [sp, #272] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x25, #112] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x25, #128] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #272] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, 
x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #240] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #256] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #272] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #240] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #256] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #272] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #240] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #256] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #272] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, 
x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #240] - ldp x21, x12, [sp, #256] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #272] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #240] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #256] - adc x12, x15, x23 - stp x21, x12, [sp, #272] - mov x0, sp - mov x1, sp - ldp x5, x6, [x1] - ldp x4, x3, [sp, #96] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [x1, #16] - ldp x4, x3, [sp, #112] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [x1, #32] - ldp x4, x3, [sp, #128] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x2, x5, x4 - eor x4, x4, x3 - adcs x11, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x4, x7, x4 - adcs x12, x8, x3 - adcs x13, x9, x3 - adc x3, x10, x3 - stp x2, x11, [x0] - stp x4, x12, [x0, #16] - stp x13, 
x3, [x0, #32] - ldp x5, x6, [sp, #192] - subs x5, x5, x2 - sbcs x6, x6, x11 - ldp x7, x8, [sp, #208] - sbcs x7, x7, x4 - sbcs x8, x8, x12 - ldp x9, x10, [sp, #224] - sbcs x9, x9, x13 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [sp, #192] - stp x7, x8, [sp, #208] - stp x9, x10, [sp, #224] - ldr q3, [sp, #144] - ldr q25, [sp, #288] - ldp x13, x23, [sp, #288] - ldp x3, x21, [sp, #144] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #320] - ldp x8, x24, [sp, #160] - subs x6, x3, x21 - ldr q0, [sp, #176] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #304] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #320] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #176] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - 
sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #144] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #160] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #176] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #144] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #160] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #176] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #144] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #160] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #176] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn 
x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #144] - ldp x21, x12, [sp, #160] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #176] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #144] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #160] - adc x12, x15, x23 - stp x21, x12, [sp, #176] - ldr q3, [sp, #240] - ldr q25, [x26, #96] - ldp x13, x23, [x26, #96] - ldp x3, x21, [sp, #240] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [x26, #128] - ldp x8, x24, [sp, #256] - subs x6, x3, x21 - ldr q0, [sp, #272] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [x26, #112] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - 
adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [x26, #128] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #272] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #240] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #256] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #272] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc 
x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #240] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #256] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #272] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #240] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #256] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #272] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #240] - ldp x21, x12, [sp, #256] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #272] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, 
xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #240] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #256] - adc x12, x15, x23 - stp x21, x12, [sp, #272] - ldp x2, x27, [sp, #0x150] - ldr q3, [sp, #48] - ldr q25, [sp, #192] - ldp x13, x23, [sp, #192] - ldp x3, x21, [sp, #48] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #224] - ldp x8, x24, [sp, #64] - subs x6, x3, x21 - ldr q0, [sp, #80] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #208] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #224] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #80] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs 
x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #192] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #208] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #224] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #192] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #208] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #224] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #192] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #208] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #224] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, 
x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #192] - ldp x21, x12, [sp, #208] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #224] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x2, x6, x20 - eor x3, x20, x23 - adcs x6, x7, x3 - adcs x7, x24, x11 - adcs x9, x13, x23 - adcs x10, x8, x23 - adc x11, x15, x23 - ldp x4, x3, [sp, #144] - subs x5, x2, x4 - sbcs x6, x6, x3 - ldp x4, x3, [sp, #160] - sbcs x7, x7, x4 - sbcs x8, x9, x3 - ldp x4, x3, [sp, #176] - sbcs x9, x10, x4 - sbcs x10, x11, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x19, x5, x4 - eor x4, x4, x3 - adcs x24, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x7, x8, [sp, #208] - stp x9, x10, [sp, #224] - ldp x0, x1, [x25, #96] - ldp x2, x3, [x25, #112] - ldp x4, x5, [x25, #128] - orr x20, x0, x1 - orr x21, x2, x3 - orr x22, x4, x5 - orr x20, x20, x21 - orr x20, x20, x22 - cmp x20, xzr - cset x20, ne // ne = any - ldp x6, x7, [x26, #96] - ldp x8, x9, [x26, #112] - ldp x10, x11, [x26, #128] - orr x21, x6, x7 - orr x22, x8, x9 - orr x23, x10, x11 - orr x21, x21, x22 - orr x21, x21, x23 - cmp x21, xzr - cset x21, ne // ne = any - cmp 
x21, x20 - ldp x12, x13, [sp, #240] - csel x12, x0, x12, cc // cc = lo, ul, last - csel x13, x1, x13, cc // cc = lo, ul, last - csel x12, x6, x12, hi // hi = pmore - csel x13, x7, x13, hi // hi = pmore - ldp x14, x15, [sp, #256] - csel x14, x2, x14, cc // cc = lo, ul, last - csel x15, x3, x15, cc // cc = lo, ul, last - csel x14, x8, x14, hi // hi = pmore - csel x15, x9, x15, hi // hi = pmore - ldp x16, x17, [sp, #272] - csel x16, x4, x16, cc // cc = lo, ul, last - csel x17, x5, x17, cc // cc = lo, ul, last - csel x16, x10, x16, hi // hi = pmore - csel x17, x11, x17, hi // hi = pmore - ldp x20, x21, [x25] - ldp x0, x1, [sp] - csel x0, x20, x0, cc // cc = lo, ul, last - csel x1, x21, x1, cc // cc = lo, ul, last - ldp x20, x21, [x26] - csel x0, x20, x0, hi // hi = pmore - csel x1, x21, x1, hi // hi = pmore - ldp x20, x21, [x25, #16] - ldp x2, x3, [sp, #16] - csel x2, x20, x2, cc // cc = lo, ul, last - csel x3, x21, x3, cc // cc = lo, ul, last - ldp x20, x21, [x26, #16] - csel x2, x20, x2, hi // hi = pmore - csel x3, x21, x3, hi // hi = pmore - ldp x20, x21, [x25, #32] - ldp x4, x5, [sp, #32] - csel x4, x20, x4, cc // cc = lo, ul, last - csel x5, x21, x5, cc // cc = lo, ul, last - ldp x20, x21, [x26, #32] - csel x4, x20, x4, hi // hi = pmore - csel x5, x21, x5, hi // hi = pmore - ldp x20, x21, [x25, #48] - csel x6, x20, x19, cc // cc = lo, ul, last - csel x7, x21, x24, cc // cc = lo, ul, last - ldp x20, x21, [x26, #48] - csel x6, x20, x6, hi // hi = pmore - csel x7, x21, x7, hi // hi = pmore - ldp x20, x21, [x25, #64] - ldp x8, x9, [sp, #208] - csel x8, x20, x8, cc // cc = lo, ul, last - csel x9, x21, x9, cc // cc = lo, ul, last - ldp x20, x21, [x26, #64] - csel x8, x20, x8, hi // hi = pmore - csel x9, x21, x9, hi // hi = pmore - ldp x20, x21, [x25, #80] - ldp x10, x11, [sp, #224] - csel x10, x20, x10, cc // cc = lo, ul, last - csel x11, x21, x11, cc // cc = lo, ul, last - ldp x20, x21, [x26, #80] - csel x10, x20, x10, hi // hi = pmore - csel x11, x21, x11, hi // hi = pmore - stp x0, x1, [x27] - stp x2, x3, [x27, #16] - stp x4, x5, [x27, #32] - stp x6, x7, [x27, #48] - stp x8, x9, [x27, #64] - stp x10, x11, [x27, #80] - stp x12, x13, [x27, #96] - stp x14, x15, [x27, #112] - stp x16, x17, [x27, #128] - add sp, sp, #0x180 - ldp x27, xzr, [sp], #16 - ldp x25, x26, [sp], #16 - ldp x23, x24, [sp], #16 - ldp x21, x22, [sp], #16 - ldp x19, x20, [sp], #16 - ret - -p384_montjscalarmul_p384_montjdouble: - sub sp, sp, #0x1a0 - stp x19, x20, [sp, #336] - stp x21, x22, [sp, #352] - stp x23, x24, [sp, #368] - stp x25, x26, [sp, #384] - stp x27, xzr, [sp, #400] - mov x25, x0 - mov x26, x1 - mov x0, sp - ldr q1, [x26, #96] - ldp x9, x2, [x26, #96] - ldr q0, [x26, #96] - ldp x4, x6, [x26, #112] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [x26, #128] - xtn v30.2s, v0.2d - ldr q1, [x26, #128] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [x26, #128] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, 
x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [x0] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [x0, #16] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [x0, #32] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [x0, #16] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [x0] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [x0, #32] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, 
x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [x0] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [x0, #16] - adc x17, x14, xzr - stp x2, x17, [x0, #32] - ldr q1, [x26, #48] - ldp x9, x2, [x26, #48] - ldr q0, [x26, #48] - ldp x4, x6, [x26, #64] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [x26, #80] - xtn v30.2s, v0.2d - ldr q1, [x26, #80] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [x26, #80] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, 
x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #48] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [sp, #64] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #80] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [sp, #64] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [sp, #48] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #80] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - 
sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [sp, #48] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #64] - adc x17, x14, xzr - stp x2, x17, [sp, #80] - ldp x5, x6, [x26] - ldp x4, x3, [sp] - adds x5, x5, x4 - adcs x6, x6, x3 - ldp x7, x8, [x26, #16] - ldp x4, x3, [sp, #16] - adcs x7, x7, x4 - adcs x8, x8, x3 - ldp x9, x10, [x26, #32] - ldp x4, x3, [sp, #32] - adcs x9, x9, x4 - adcs x10, x10, x3 - csetm x3, cs // cs = hs, nlast - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - subs x5, x5, x4 - eor x4, x4, x3 - sbcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - sbcs x7, x7, x4 - sbcs x8, x8, x3 - sbcs x9, x9, x3 - sbc x10, x10, x3 - stp x5, x6, [sp, #240] - stp x7, x8, [sp, #256] - stp x9, x10, [sp, #272] - mov x2, sp - ldp x5, x6, [x26] - ldp x4, x3, [x2] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [x26, #16] - ldp x4, x3, [x2, #16] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [x26, #32] - ldp x4, x3, [x2, #32] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x13, x5, x4 - eor x4, x4, x3 - adcs x23, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x13, x23, [sp, #192] - stp x7, x8, [sp, #208] - stp x9, x10, [sp, #224] - ldr q3, [sp, #240] - ldr q25, [sp, #192] - ldp x3, x21, [sp, #240] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #224] - ldp x8, x24, [sp, #256] - subs x6, x3, x21 - ldr q0, [sp, #272] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #208] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs 
x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #224] - add x2, x12, x7 - adc x7, x5, x10 - ldp x5, x10, [sp, #272] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x16, x4, x16 - mov x4, v27.d[0] - sbcs x11, x20, x11 - sbcs x20, x9, x12 - stp x16, x11, [sp, #96] - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #112] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #128] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - 
subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - ldp x20, x9, [sp, #96] - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #112] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #128] - adds x20, x22, x20 - mul x10, x13, x14 - adcs x11, x11, x9 - eor x9, x8, x21 - adcs x21, x19, x17 - stp x20, x11, [sp, #96] - adcs x12, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - stp x21, x12, [sp, #112] - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #128] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #96] - ldp x21, x12, [sp, #112] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #128] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x21 - eor x1, x22, x9 - adcs x24, x23, x12 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x21 - adcs x15, x17, x12 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc 
x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #96] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #112] - adc x12, x15, x23 - stp x21, x12, [sp, #128] - ldp x5, x6, [x26, #48] - ldp x4, x3, [x26, #96] - adds x5, x5, x4 - adcs x6, x6, x3 - ldp x7, x8, [x26, #64] - ldp x4, x3, [x26, #112] - adcs x7, x7, x4 - adcs x8, x8, x3 - ldp x9, x10, [x26, #80] - ldp x4, x3, [x26, #128] - adcs x9, x9, x4 - adcs x10, x10, x3 - adc x3, xzr, xzr - mov x4, #0xffffffff // #4294967295 - cmp x5, x4 - mov x4, #0xffffffff00000000 // #-4294967296 - sbcs xzr, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - sbcs xzr, x7, x4 - adcs xzr, x8, xzr - adcs xzr, x9, xzr - adcs xzr, x10, xzr - adcs x3, x3, xzr - csetm x3, ne // ne = any - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - subs x5, x5, x4 - eor x4, x4, x3 - sbcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - sbcs x7, x7, x4 - sbcs x8, x8, x3 - sbcs x9, x9, x3 - sbc x10, x10, x3 - stp x5, x6, [sp, #240] - stp x7, x8, [sp, #256] - stp x9, x10, [sp, #272] - ldr q1, [sp, #96] - ldp x9, x2, [sp, #96] - ldr q0, [sp, #96] - ldp x4, x6, [sp, #112] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [sp, #128] - xtn v30.2s, v0.2d - ldr q1, [sp, #128] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [sp, #128] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, 
x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x15, x3, x17 - sbcs x3, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #288] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - stp x15, x3, [sp, #304] - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #320] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - ldp x9, x17, [sp, #304] - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [sp, #288] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #320] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x9 - adcs x1, x1, x17 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, 
x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [sp, #288] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #304] - adc x17, x14, xzr - stp x2, x17, [sp, #320] - ldr q3, [x26] - ldr q25, [sp, #48] - ldp x13, x23, [sp, #48] - ldp x3, x21, [x26] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #80] - ldp x8, x24, [x26, #16] - subs x6, x3, x21 - ldr q0, [x26, #32] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #64] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #80] - add x2, x12, x7 - 
adc x7, x5, x10 - ldp x5, x10, [x26, #32] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x26, x4, x16 - mov x4, v27.d[0] - sbcs x27, x20, x11 - sbcs x20, x9, x12 - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #160] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #176] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #160] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #176] - adds x20, x22, x26 - mul x10, x13, x14 - adcs x11, x11, x27 - eor x9, x8, x21 - adcs x26, x19, x17 - stp x20, x11, [sp, #144] - adcs x27, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #176] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // 
cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #144] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #176] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x26 - eor x1, x22, x9 - adcs x24, x23, x27 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x26 - adcs x15, x17, x27 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #144] - adcs x5, x13, x23 - adcs x21, x8, x23 - stp x14, x5, [sp, #160] - adc x12, x15, x23 - stp x21, x12, [sp, #176] - ldr q1, [sp, #240] - ldp x9, x2, [sp, #240] - ldr q0, [sp, #240] - ldp x4, x6, [sp, #256] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [sp, #272] - xtn v30.2s, v0.2d - ldr q1, [sp, #272] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [sp, #272] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn 
v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x19, x3, x17 - sbcs x20, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #192] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #224] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - 
ldp x7, x16, [sp, #192] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #224] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x19 - adcs x1, x1, x20 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // #4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x19, x13, x1 - and x13, x4, x9 - adcs x20, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #208] - adc x17, x14, xzr - stp x2, x17, [sp, #224] - ldp x0, x1, [sp, #288] - mov x6, #0xffffffff // #4294967295 - subs x6, x6, x0 - mov x7, #0xffffffff00000000 // #-4294967296 - sbcs x7, x7, x1 - ldp x0, x1, [sp, #304] - mov x8, #0xfffffffffffffffe // #-2 - sbcs x8, x8, x0 - mov x13, #0xffffffffffffffff // #-1 - sbcs x9, x13, x1 - ldp x0, x1, [sp, #320] - sbcs x10, x13, x0 - sbc x11, x13, x1 - mov x12, #0x9 // #9 - mul x0, x12, x6 - mul x1, x12, x7 - mul x2, x12, x8 - mul x3, x12, x9 - mul x4, x12, x10 - mul x5, x12, x11 - umulh x6, x12, x6 - umulh x7, x12, x7 - umulh x8, x12, x8 - umulh x9, x12, x9 - umulh x10, x12, x10 - umulh x12, x12, x11 - adds x1, x1, x6 - adcs x2, x2, x7 - adcs x3, x3, x8 - adcs x4, x4, x9 - adcs x5, x5, x10 - mov x6, #0x1 // #1 - adc x6, x12, x6 - ldp x8, x9, [sp, #144] - ldp x10, x11, [sp, #160] - ldp x12, x13, [sp, #176] - mov x14, #0xc // #12 - mul x15, x14, x8 - umulh x8, x14, x8 - adds x0, x0, x15 - mul x15, x14, x9 - umulh x9, x14, x9 - adcs x1, x1, x15 - mul x15, x14, x10 - umulh x10, x14, x10 - adcs x2, x2, x15 - mul x15, x14, x11 - umulh x11, x14, x11 - adcs x3, x3, x15 - mul x15, x14, x12 - umulh x12, x14, x12 - 
adcs x4, x4, x15 - mul x15, x14, x13 - umulh x13, x14, x13 - adcs x5, x5, x15 - adc x6, x6, xzr - adds x1, x1, x8 - adcs x2, x2, x9 - adcs x3, x3, x10 - adcs x4, x4, x11 - adcs x5, x5, x12 - adcs x6, x6, x13 - lsl x7, x6, #32 - subs x8, x6, x7 - sbc x7, x7, xzr - adds x0, x0, x8 - adcs x1, x1, x7 - adcs x2, x2, x6 - adcs x3, x3, xzr - adcs x4, x4, xzr - adcs x5, x5, xzr - csetm x6, cc // cc = lo, ul, last - mov x7, #0xffffffff // #4294967295 - and x7, x7, x6 - adds x0, x0, x7 - eor x7, x7, x6 - adcs x1, x1, x7 - mov x7, #0xfffffffffffffffe // #-2 - and x7, x7, x6 - adcs x2, x2, x7 - adcs x3, x3, x6 - adcs x4, x4, x6 - adc x5, x5, x6 - stp x0, x1, [sp, #288] - stp x2, x3, [sp, #304] - stp x4, x5, [sp, #320] - mov x2, sp - ldp x4, x3, [x2] - subs x5, x19, x4 - sbcs x6, x20, x3 - ldp x7, x8, [sp, #208] - ldp x4, x3, [x2, #16] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [sp, #224] - ldp x4, x3, [x2, #32] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [sp, #240] - stp x7, x8, [sp, #256] - stp x9, x10, [sp, #272] - ldr q1, [sp, #48] - ldp x9, x2, [sp, #48] - ldr q0, [sp, #48] - ldp x4, x6, [sp, #64] - rev64 v21.4s, v1.4s - uzp2 v28.4s, v1.4s, v1.4s - umulh x7, x9, x2 - xtn v17.2s, v1.2d - mul v27.4s, v21.4s, v0.4s - ldr q20, [sp, #80] - xtn v30.2s, v0.2d - ldr q1, [sp, #80] - uzp2 v31.4s, v0.4s, v0.4s - ldp x5, x10, [sp, #80] - umulh x8, x9, x4 - uaddlp v3.2d, v27.4s - umull v16.2d, v30.2s, v17.2s - mul x16, x9, x4 - umull v27.2d, v30.2s, v28.2s - shrn v0.2s, v20.2d, #32 - xtn v7.2s, v20.2d - shl v20.2d, v3.2d, #32 - umull v3.2d, v31.2s, v28.2s - mul x3, x2, x4 - umlal v20.2d, v30.2s, v17.2s - umull v22.2d, v7.2s, v0.2s - usra v27.2d, v16.2d, #32 - umulh x11, x2, x4 - movi v21.2d, #0xffffffff - uzp2 v28.4s, v1.4s, v1.4s - adds x15, x16, x7 - and v5.16b, v27.16b, v21.16b - adcs x3, x3, x8 - usra v3.2d, v27.2d, #32 - dup v29.2d, x6 - adcs x16, x11, xzr - mov x14, v20.d[0] - umlal v5.2d, v31.2s, v17.2s - mul x8, x9, x2 - mov x7, v20.d[1] - shl v19.2d, v22.2d, #33 - xtn v25.2s, v29.2d - rev64 v31.4s, v1.4s - lsl x13, x14, #32 - uzp2 v6.4s, v29.4s, v29.4s - umlal v19.2d, v7.2s, v7.2s - usra v3.2d, v5.2d, #32 - adds x1, x8, x8 - umulh x8, x4, x4 - add x12, x13, x14 - mul v17.4s, v31.4s, v29.4s - xtn v4.2s, v1.2d - adcs x14, x15, x15 - lsr x13, x12, #32 - adcs x15, x3, x3 - umull v31.2d, v25.2s, v28.2s - adcs x11, x16, x16 - umull v21.2d, v25.2s, v4.2s - mov x17, v3.d[0] - umull v18.2d, v6.2s, v28.2s - adc x16, x8, xzr - uaddlp v16.2d, v17.4s - movi v1.2d, #0xffffffff - subs x13, x13, x12 - usra v31.2d, v21.2d, #32 - sbc x8, x12, xzr - adds x17, x17, x1 - mul x1, x4, x4 - shl v28.2d, v16.2d, #32 - mov x3, v3.d[1] - adcs x14, x7, x14 - extr x7, x8, x13, #32 - adcs x13, x3, x15 - and v3.16b, v31.16b, v1.16b - adcs x11, x1, x11 - lsr x1, x8, #32 - umlal v3.2d, v6.2s, v4.2s - usra v18.2d, v31.2d, #32 - adc x3, x16, xzr - adds x1, x1, x12 - umlal v28.2d, v25.2s, v4.2s - adc x16, xzr, xzr - subs x15, x17, x7 - sbcs x7, x14, x1 - lsl x1, x15, #32 - sbcs x16, x13, x16 - add x8, x1, x15 - usra v18.2d, v3.2d, #32 - sbcs x14, x11, xzr - lsr x1, x8, #32 - sbcs x17, x3, xzr - sbc x11, x12, xzr - subs x13, x1, x8 - umulh x12, x4, x10 - sbc x1, x8, xzr - extr x13, x1, x13, #32 - lsr x1, x1, #32 - adds x15, x1, x8 - adc x1, xzr, xzr - subs 
x7, x7, x13 - sbcs x13, x16, x15 - lsl x3, x7, #32 - umulh x16, x2, x5 - sbcs x15, x14, x1 - add x7, x3, x7 - sbcs x3, x17, xzr - lsr x1, x7, #32 - sbcs x14, x11, xzr - sbc x11, x8, xzr - subs x8, x1, x7 - sbc x1, x7, xzr - extr x8, x1, x8, #32 - lsr x1, x1, #32 - adds x1, x1, x7 - adc x17, xzr, xzr - subs x13, x13, x8 - umulh x8, x9, x6 - sbcs x1, x15, x1 - sbcs x19, x3, x17 - sbcs x20, x14, xzr - mul x17, x2, x5 - sbcs x11, x11, xzr - stp x13, x1, [sp, #192] - sbc x14, x7, xzr - mul x7, x4, x10 - subs x1, x9, x2 - csetm x15, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - stp x11, x14, [sp, #224] - mul x14, x9, x6 - adds x17, x8, x17 - adcs x7, x16, x7 - adc x13, x12, xzr - subs x12, x5, x6 - cneg x3, x12, cc // cc = lo, ul, last - cinv x16, x15, cc // cc = lo, ul, last - mul x8, x1, x3 - umulh x1, x1, x3 - eor x12, x8, x16 - adds x11, x17, x14 - adcs x3, x7, x17 - adcs x15, x13, x7 - adc x8, x13, xzr - adds x3, x3, x14 - adcs x15, x15, x17 - adcs x17, x8, x7 - eor x1, x1, x16 - adc x13, x13, xzr - subs x9, x9, x4 - csetm x8, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x4, x2, x4 - cneg x4, x4, cc // cc = lo, ul, last - csetm x7, cc // cc = lo, ul, last - subs x2, x10, x6 - cinv x8, x8, cc // cc = lo, ul, last - cneg x2, x2, cc // cc = lo, ul, last - cmn x16, #0x1 - adcs x11, x11, x12 - mul x12, x9, x2 - adcs x3, x3, x1 - adcs x15, x15, x16 - umulh x9, x9, x2 - adcs x17, x17, x16 - adc x13, x13, x16 - subs x1, x10, x5 - cinv x2, x7, cc // cc = lo, ul, last - cneg x1, x1, cc // cc = lo, ul, last - eor x9, x9, x8 - cmn x8, #0x1 - eor x7, x12, x8 - mul x12, x4, x1 - adcs x3, x3, x7 - adcs x7, x15, x9 - adcs x15, x17, x8 - umulh x4, x4, x1 - adc x8, x13, x8 - cmn x2, #0x1 - eor x1, x12, x2 - adcs x1, x7, x1 - ldp x7, x16, [sp, #192] - eor x12, x4, x2 - adcs x4, x15, x12 - ldp x15, x12, [sp, #224] - adc x8, x8, x2 - adds x13, x14, x14 - umulh x14, x5, x10 - adcs x2, x11, x11 - adcs x3, x3, x3 - adcs x1, x1, x1 - adcs x4, x4, x4 - adcs x11, x8, x8 - adc x8, xzr, xzr - adds x13, x13, x7 - adcs x2, x2, x16 - mul x16, x5, x10 - adcs x3, x3, x19 - adcs x1, x1, x20 - umulh x5, x5, x5 - lsl x9, x13, #32 - add x9, x9, x13 - adcs x4, x4, x15 - mov x13, v28.d[1] - adcs x15, x11, x12 - lsr x7, x9, #32 - adc x11, x8, xzr - subs x7, x7, x9 - umulh x10, x10, x10 - sbc x17, x9, xzr - extr x7, x17, x7, #32 - lsr x17, x17, #32 - adds x17, x17, x9 - adc x12, xzr, xzr - subs x8, x2, x7 - sbcs x17, x3, x17 - lsl x7, x8, #32 - sbcs x2, x1, x12 - add x3, x7, x8 - sbcs x12, x4, xzr - lsr x1, x3, #32 - sbcs x7, x15, xzr - sbc x15, x9, xzr - subs x1, x1, x3 - sbc x4, x3, xzr - lsr x9, x4, #32 - extr x8, x4, x1, #32 - adds x9, x9, x3 - adc x4, xzr, xzr - subs x1, x17, x8 - lsl x17, x1, #32 - sbcs x8, x2, x9 - sbcs x9, x12, x4 - add x17, x17, x1 - mov x1, v18.d[1] - lsr x2, x17, #32 - sbcs x7, x7, xzr - mov x12, v18.d[0] - sbcs x15, x15, xzr - sbc x3, x3, xzr - subs x4, x2, x17 - sbc x2, x17, xzr - adds x12, x13, x12 - adcs x16, x16, x1 - lsr x13, x2, #32 - extr x1, x2, x4, #32 - adc x2, x14, xzr - adds x4, x13, x17 - mul x13, x6, x6 - adc x14, xzr, xzr - subs x1, x8, x1 - sbcs x4, x9, x4 - mov x9, v28.d[0] - sbcs x7, x7, x14 - sbcs x8, x15, xzr - sbcs x3, x3, xzr - sbc x14, x17, xzr - adds x17, x9, x9 - adcs x12, x12, x12 - mov x15, v19.d[0] - adcs x9, x16, x16 - umulh x6, x6, x6 - adcs x16, x2, x2 - adc x2, xzr, xzr - adds x11, x11, x8 - adcs x3, x3, xzr - adcs x14, x14, xzr - adcs x8, xzr, xzr - adds x13, x1, x13 - mov x1, v19.d[1] - adcs x6, x4, x6 - mov x4, #0xffffffff // 
#4294967295 - adcs x15, x7, x15 - adcs x7, x11, x5 - adcs x1, x3, x1 - adcs x14, x14, x10 - adc x11, x8, xzr - adds x6, x6, x17 - adcs x8, x15, x12 - adcs x3, x7, x9 - adcs x15, x1, x16 - mov x16, #0xffffffff00000001 // #-4294967295 - adcs x14, x14, x2 - mov x2, #0x1 // #1 - adc x17, x11, xzr - cmn x13, x16 - adcs xzr, x6, x4 - adcs xzr, x8, x2 - adcs xzr, x3, xzr - adcs xzr, x15, xzr - adcs xzr, x14, xzr - adc x1, x17, xzr - neg x9, x1 - and x1, x16, x9 - adds x11, x13, x1 - and x13, x4, x9 - adcs x5, x6, x13 - and x1, x2, x9 - adcs x7, x8, x1 - stp x11, x5, [sp, #192] - adcs x11, x3, xzr - adcs x2, x15, xzr - stp x7, x11, [sp, #208] - adc x17, x14, xzr - stp x2, x17, [sp, #224] - ldp x5, x6, [sp, #240] - ldp x4, x3, [sp, #48] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [sp, #256] - ldp x4, x3, [sp, #64] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [sp, #272] - ldp x4, x3, [sp, #80] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc // cc = lo, ul, last - mov x4, #0xffffffff // #4294967295 - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe // #-2 - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [x25, #96] - stp x7, x8, [x25, #112] - stp x9, x10, [x25, #128] - ldr q3, [sp, #288] - ldr q25, [sp, #96] - ldp x13, x23, [sp, #96] - ldp x3, x21, [sp, #288] - rev64 v23.4s, v25.4s - uzp1 v17.4s, v25.4s, v3.4s - umulh x15, x3, x13 - mul v6.4s, v23.4s, v3.4s - uzp1 v3.4s, v3.4s, v3.4s - ldr q27, [sp, #128] - ldp x8, x24, [sp, #304] - subs x6, x3, x21 - ldr q0, [sp, #320] - movi v23.2d, #0xffffffff - csetm x10, cc // cc = lo, ul, last - umulh x19, x21, x23 - rev64 v4.4s, v27.4s - uzp2 v25.4s, v27.4s, v27.4s - cneg x4, x6, cc // cc = lo, ul, last - subs x7, x23, x13 - xtn v22.2s, v0.2d - xtn v24.2s, v27.2d - cneg x20, x7, cc // cc = lo, ul, last - ldp x6, x14, [sp, #112] - mul v27.4s, v4.4s, v0.4s - uaddlp v20.2d, v6.4s - cinv x5, x10, cc // cc = lo, ul, last - mul x16, x4, x20 - uzp2 v6.4s, v0.4s, v0.4s - umull v21.2d, v22.2s, v25.2s - shl v0.2d, v20.2d, #32 - umlal v0.2d, v3.2s, v17.2s - mul x22, x8, x6 - umull v1.2d, v6.2s, v25.2s - subs x12, x3, x8 - umull v20.2d, v22.2s, v24.2s - cneg x17, x12, cc // cc = lo, ul, last - umulh x9, x8, x6 - mov x12, v0.d[1] - eor x11, x16, x5 - mov x7, v0.d[0] - csetm x10, cc // cc = lo, ul, last - usra v21.2d, v20.2d, #32 - adds x15, x15, x12 - adcs x12, x19, x22 - umulh x20, x4, x20 - adc x19, x9, xzr - usra v1.2d, v21.2d, #32 - adds x22, x15, x7 - and v26.16b, v21.16b, v23.16b - adcs x16, x12, x15 - uaddlp v25.2d, v27.4s - adcs x9, x19, x12 - umlal v26.2d, v6.2s, v24.2s - adc x4, x19, xzr - adds x16, x16, x7 - shl v27.2d, v25.2d, #32 - adcs x9, x9, x15 - adcs x4, x4, x12 - eor x12, x20, x5 - adc x15, x19, xzr - subs x20, x6, x13 - cneg x20, x20, cc // cc = lo, ul, last - cinv x10, x10, cc // cc = lo, ul, last - cmn x5, #0x1 - mul x19, x17, x20 - adcs x11, x22, x11 - adcs x12, x16, x12 - adcs x9, x9, x5 - umulh x17, x17, x20 - adcs x22, x4, x5 - adc x5, x15, x5 - subs x16, x21, x8 - cneg x20, x16, cc // cc = lo, ul, last - eor x19, x19, x10 - csetm x4, cc // cc = lo, ul, last - subs x16, x6, x23 - cneg x16, x16, cc // cc = lo, ul, last - umlal v27.2d, v22.2s, v24.2s - mul x15, x20, x16 - cinv x4, x4, cc // cc = lo, ul, last - cmn x10, #0x1 - usra v1.2d, v26.2d, #32 - adcs x19, x12, x19 - eor x17, x17, x10 - adcs x9, x9, x17 - adcs x22, x22, x10 - lsl x12, x7, #32 - umulh x20, x20, x16 - eor x16, x15, x4 - ldp x15, x17, [sp, #128] - add x2, x12, x7 
- adc x7, x5, x10 - ldp x5, x10, [sp, #320] - lsr x1, x2, #32 - eor x12, x20, x4 - subs x1, x1, x2 - sbc x20, x2, xzr - cmn x4, #0x1 - adcs x9, x9, x16 - extr x1, x20, x1, #32 - lsr x20, x20, #32 - adcs x22, x22, x12 - adc x16, x7, x4 - adds x12, x20, x2 - umulh x7, x24, x14 - adc x4, xzr, xzr - subs x1, x11, x1 - sbcs x20, x19, x12 - sbcs x12, x9, x4 - lsl x9, x1, #32 - add x1, x9, x1 - sbcs x9, x22, xzr - mul x22, x24, x14 - sbcs x16, x16, xzr - lsr x4, x1, #32 - sbc x19, x2, xzr - subs x4, x4, x1 - sbc x11, x1, xzr - extr x2, x11, x4, #32 - lsr x4, x11, #32 - adds x4, x4, x1 - adc x11, xzr, xzr - subs x2, x20, x2 - sbcs x4, x12, x4 - sbcs x20, x9, x11 - lsl x12, x2, #32 - add x2, x12, x2 - sbcs x9, x16, xzr - lsr x11, x2, #32 - sbcs x19, x19, xzr - sbc x1, x1, xzr - subs x16, x11, x2 - sbc x12, x2, xzr - extr x16, x12, x16, #32 - lsr x12, x12, #32 - adds x11, x12, x2 - adc x12, xzr, xzr - subs x26, x4, x16 - mov x4, v27.d[0] - sbcs x27, x20, x11 - sbcs x20, x9, x12 - sbcs x11, x19, xzr - sbcs x9, x1, xzr - stp x20, x11, [sp, #256] - mov x1, v1.d[0] - sbc x20, x2, xzr - subs x12, x24, x5 - mov x11, v27.d[1] - cneg x16, x12, cc // cc = lo, ul, last - csetm x2, cc // cc = lo, ul, last - subs x19, x15, x14 - mov x12, v1.d[1] - cinv x2, x2, cc // cc = lo, ul, last - cneg x19, x19, cc // cc = lo, ul, last - stp x9, x20, [sp, #272] - mul x9, x16, x19 - adds x4, x7, x4 - adcs x11, x1, x11 - adc x1, x12, xzr - adds x20, x4, x22 - umulh x19, x16, x19 - adcs x7, x11, x4 - eor x16, x9, x2 - adcs x9, x1, x11 - adc x12, x1, xzr - adds x7, x7, x22 - adcs x4, x9, x4 - adcs x9, x12, x11 - adc x12, x1, xzr - cmn x2, #0x1 - eor x1, x19, x2 - adcs x11, x20, x16 - adcs x19, x7, x1 - adcs x1, x4, x2 - adcs x20, x9, x2 - adc x2, x12, x2 - subs x12, x24, x10 - cneg x16, x12, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x9, x17, x14 - cinv x12, x12, cc // cc = lo, ul, last - cneg x9, x9, cc // cc = lo, ul, last - subs x3, x24, x3 - sbcs x21, x5, x21 - mul x24, x16, x9 - sbcs x4, x10, x8 - ngc x8, xzr - subs x10, x5, x10 - eor x5, x24, x12 - csetm x7, cc // cc = lo, ul, last - cneg x24, x10, cc // cc = lo, ul, last - subs x10, x17, x15 - cinv x7, x7, cc // cc = lo, ul, last - cneg x10, x10, cc // cc = lo, ul, last - subs x14, x13, x14 - sbcs x15, x23, x15 - eor x13, x21, x8 - mul x23, x24, x10 - sbcs x17, x6, x17 - eor x6, x3, x8 - ngc x21, xzr - umulh x9, x16, x9 - cmn x8, #0x1 - eor x3, x23, x7 - adcs x23, x6, xzr - adcs x13, x13, xzr - eor x16, x4, x8 - adc x16, x16, xzr - eor x4, x17, x21 - umulh x17, x24, x10 - cmn x21, #0x1 - eor x24, x14, x21 - eor x6, x15, x21 - adcs x15, x24, xzr - adcs x14, x6, xzr - adc x6, x4, xzr - cmn x12, #0x1 - eor x4, x9, x12 - adcs x19, x19, x5 - umulh x5, x23, x15 - adcs x1, x1, x4 - adcs x10, x20, x12 - eor x4, x17, x7 - adc x2, x2, x12 - cmn x7, #0x1 - adcs x12, x1, x3 - ldp x17, x24, [sp, #256] - mul x1, x16, x6 - adcs x3, x10, x4 - adc x2, x2, x7 - ldp x7, x4, [sp, #272] - adds x20, x22, x26 - mul x10, x13, x14 - adcs x11, x11, x27 - eor x9, x8, x21 - adcs x26, x19, x17 - stp x20, x11, [sp, #240] - adcs x27, x12, x24 - mul x8, x23, x15 - adcs x3, x3, x7 - adcs x12, x2, x4 - adc x19, xzr, xzr - subs x21, x23, x16 - umulh x2, x16, x6 - stp x3, x12, [sp, #272] - cneg x3, x21, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - umulh x11, x13, x14 - subs x21, x13, x16 - eor x7, x8, x9 - cneg x17, x21, cc // cc = lo, ul, last - csetm x16, cc // cc = lo, ul, last - subs x21, x6, x15 - cneg x22, x21, cc // cc = lo, ul, last - cinv x21, x24, cc // 
cc = lo, ul, last - subs x20, x23, x13 - umulh x12, x3, x22 - cneg x23, x20, cc // cc = lo, ul, last - csetm x24, cc // cc = lo, ul, last - subs x20, x14, x15 - cinv x24, x24, cc // cc = lo, ul, last - mul x22, x3, x22 - cneg x3, x20, cc // cc = lo, ul, last - subs x13, x6, x14 - cneg x20, x13, cc // cc = lo, ul, last - cinv x15, x16, cc // cc = lo, ul, last - adds x13, x5, x10 - mul x4, x23, x3 - adcs x11, x11, x1 - adc x14, x2, xzr - adds x5, x13, x8 - adcs x16, x11, x13 - umulh x23, x23, x3 - adcs x3, x14, x11 - adc x1, x14, xzr - adds x10, x16, x8 - adcs x6, x3, x13 - adcs x8, x1, x11 - umulh x13, x17, x20 - eor x1, x4, x24 - adc x4, x14, xzr - cmn x24, #0x1 - adcs x1, x5, x1 - eor x16, x23, x24 - eor x11, x1, x9 - adcs x23, x10, x16 - eor x2, x22, x21 - adcs x3, x6, x24 - mul x14, x17, x20 - eor x17, x13, x15 - adcs x13, x8, x24 - adc x8, x4, x24 - cmn x21, #0x1 - adcs x6, x23, x2 - mov x16, #0xfffffffffffffffe // #-2 - eor x20, x12, x21 - adcs x20, x3, x20 - eor x23, x14, x15 - adcs x2, x13, x21 - adc x8, x8, x21 - cmn x15, #0x1 - ldp x5, x4, [sp, #240] - adcs x22, x20, x23 - eor x23, x22, x9 - adcs x17, x2, x17 - adc x22, x8, x15 - cmn x9, #0x1 - adcs x15, x7, x5 - ldp x10, x14, [sp, #272] - eor x1, x6, x9 - lsl x2, x15, #32 - adcs x8, x11, x4 - adcs x13, x1, x26 - eor x1, x22, x9 - adcs x24, x23, x27 - eor x11, x17, x9 - adcs x23, x11, x10 - adcs x7, x1, x14 - adcs x17, x9, x19 - adcs x20, x9, xzr - add x1, x2, x15 - lsr x3, x1, #32 - adcs x11, x9, xzr - adc x9, x9, xzr - subs x3, x3, x1 - sbc x6, x1, xzr - adds x24, x24, x5 - adcs x4, x23, x4 - extr x3, x6, x3, #32 - lsr x6, x6, #32 - adcs x21, x7, x26 - adcs x15, x17, x27 - adcs x7, x20, x10 - adcs x20, x11, x14 - mov x14, #0xffffffff // #4294967295 - adc x22, x9, x19 - adds x12, x6, x1 - adc x10, xzr, xzr - subs x3, x8, x3 - sbcs x12, x13, x12 - lsl x9, x3, #32 - add x3, x9, x3 - sbcs x10, x24, x10 - sbcs x24, x4, xzr - lsr x9, x3, #32 - sbcs x21, x21, xzr - sbc x1, x1, xzr - subs x9, x9, x3 - sbc x13, x3, xzr - extr x9, x13, x9, #32 - lsr x13, x13, #32 - adds x13, x13, x3 - adc x6, xzr, xzr - subs x12, x12, x9 - sbcs x17, x10, x13 - lsl x2, x12, #32 - sbcs x10, x24, x6 - add x9, x2, x12 - sbcs x6, x21, xzr - lsr x5, x9, #32 - sbcs x21, x1, xzr - sbc x13, x3, xzr - subs x8, x5, x9 - sbc x19, x9, xzr - lsr x12, x19, #32 - extr x3, x19, x8, #32 - adds x8, x12, x9 - adc x1, xzr, xzr - subs x2, x17, x3 - sbcs x12, x10, x8 - sbcs x5, x6, x1 - sbcs x3, x21, xzr - sbcs x19, x13, xzr - sbc x24, x9, xzr - adds x23, x15, x3 - adcs x8, x7, x19 - adcs x11, x20, x24 - adc x9, x22, xzr - add x24, x9, #0x1 - lsl x7, x24, #32 - subs x21, x24, x7 - sbc x10, x7, xzr - adds x6, x2, x21 - adcs x7, x12, x10 - adcs x24, x5, x24 - adcs x13, x23, xzr - adcs x8, x8, xzr - adcs x15, x11, xzr - csetm x23, cc // cc = lo, ul, last - and x11, x16, x23 - and x20, x14, x23 - adds x22, x6, x20 - eor x3, x20, x23 - adcs x5, x7, x3 - adcs x14, x24, x11 - stp x22, x5, [sp, #240] - adcs x5, x13, x23 - adcs x12, x8, x23 - stp x14, x5, [sp, #256] - adc x19, x15, x23 - ldp x1, x2, [sp, #144] - ldp x3, x4, [sp, #160] - ldp x5, x6, [sp, #176] - lsl x0, x1, #2 - ldp x7, x8, [sp, #288] - subs x0, x0, x7 - extr x1, x2, x1, #62 - sbcs x1, x1, x8 - ldp x7, x8, [sp, #304] - extr x2, x3, x2, #62 - sbcs x2, x2, x7 - extr x3, x4, x3, #62 - sbcs x3, x3, x8 - extr x4, x5, x4, #62 - ldp x7, x8, [sp, #320] - sbcs x4, x4, x7 - extr x5, x6, x5, #62 - sbcs x5, x5, x8 - lsr x6, x6, #62 - adc x6, x6, xzr - lsl x7, x6, #32 - subs x8, x6, x7 - sbc x7, x7, xzr - adds x0, x0, x8 - adcs x1, 
x1, x7 - adcs x2, x2, x6 - adcs x3, x3, xzr - adcs x4, x4, xzr - adcs x5, x5, xzr - csetm x8, cc // cc = lo, ul, last - mov x9, #0xffffffff // #4294967295 - and x9, x9, x8 - adds x0, x0, x9 - eor x9, x9, x8 - adcs x1, x1, x9 - mov x9, #0xfffffffffffffffe // #-2 - and x9, x9, x8 - adcs x2, x2, x9 - adcs x3, x3, x8 - adcs x4, x4, x8 - adc x5, x5, x8 - stp x0, x1, [x25] - stp x2, x3, [x25, #16] - stp x4, x5, [x25, #32] - ldp x0, x1, [sp, #192] - mov x6, #0xffffffff // #4294967295 - subs x6, x6, x0 - mov x7, #0xffffffff00000000 // #-4294967296 - sbcs x7, x7, x1 - ldp x0, x1, [sp, #208] - mov x8, #0xfffffffffffffffe // #-2 - sbcs x8, x8, x0 - mov x13, #0xffffffffffffffff // #-1 - sbcs x9, x13, x1 - ldp x0, x1, [sp, #224] - sbcs x10, x13, x0 - sbc x11, x13, x1 - lsl x0, x6, #3 - extr x1, x7, x6, #61 - extr x2, x8, x7, #61 - extr x3, x9, x8, #61 - extr x4, x10, x9, #61 - extr x5, x11, x10, #61 - lsr x6, x11, #61 - add x6, x6, #0x1 - ldp x8, x9, [sp, #240] - ldp x10, x11, [sp, #256] - mov x14, #0x3 // #3 - mul x15, x14, x8 - umulh x8, x14, x8 - adds x0, x0, x15 - mul x15, x14, x9 - umulh x9, x14, x9 - adcs x1, x1, x15 - mul x15, x14, x10 - umulh x10, x14, x10 - adcs x2, x2, x15 - mul x15, x14, x11 - umulh x11, x14, x11 - adcs x3, x3, x15 - mul x15, x14, x12 - umulh x12, x14, x12 - adcs x4, x4, x15 - mul x15, x14, x19 - umulh x13, x14, x19 - adcs x5, x5, x15 - adc x6, x6, xzr - adds x1, x1, x8 - adcs x2, x2, x9 - adcs x3, x3, x10 - adcs x4, x4, x11 - adcs x5, x5, x12 - adcs x6, x6, x13 - lsl x7, x6, #32 - subs x8, x6, x7 - sbc x7, x7, xzr - adds x0, x0, x8 - adcs x1, x1, x7 - adcs x2, x2, x6 - adcs x3, x3, xzr - adcs x4, x4, xzr - adcs x5, x5, xzr - csetm x6, cc // cc = lo, ul, last - mov x7, #0xffffffff // #4294967295 - and x7, x7, x6 - adds x0, x0, x7 - eor x7, x7, x6 - adcs x1, x1, x7 - mov x7, #0xfffffffffffffffe // #-2 - and x7, x7, x6 - adcs x2, x2, x7 - adcs x3, x3, x6 - adcs x4, x4, x6 - adc x5, x5, x6 - stp x0, x1, [x25, #48] - stp x2, x3, [x25, #64] - stp x4, x5, [x25, #80] - ldp x19, x20, [sp, #336] - ldp x21, x22, [sp, #352] - ldp x23, x24, [sp, #368] - ldp x25, x26, [sp, #384] - ldp x27, xzr, [sp, #400] - add sp, sp, #0x1a0 - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p521/bignum_inv_p521.S b/third_party/s2n-bignum/arm/p521/bignum_inv_p521.S deleted file mode 100644 index 7db741647d6..00000000000 --- a/third_party/s2n-bignum/arm/p521/bignum_inv_p521.S +++ /dev/null @@ -1,1696 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Modular inverse modulo p_521 = 2^521 - 1 -// Input x[9]; output z[9] -// -// extern void bignum_inv_p521(uint64_t z[static 9],uint64_t x[static 9]); -// -// Assuming the 9-digit input x is coprime to p_521, i.e. is not divisible -// by it, returns z < p_521 such that x * z == 1 (mod p_521). Note that -// x does not need to be reduced modulo p_521, but the output always is. 
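Because p_521 = 2^521 - 1 is prime, the inverse exists for every x that p_521 does not divide, so the contract stated above can be cross-checked against a trivial reference. Below is a minimal Python sketch of that contract only, not of the divstep algorithm that follows; the helper names (limbs_to_int, int_to_limbs, inv_p521_ref) are illustrative, assuming the usual little-endian 64-bit limb layout.

    # Reference model for the bignum_inv_p521 contract (not its divstep implementation).
    P521 = (1 << 521) - 1

    def limbs_to_int(limbs):                 # 9 little-endian 64-bit limbs -> integer
        return sum(w << (64 * i) for i, w in enumerate(limbs))

    def int_to_limbs(n, k=9):                # integer -> k little-endian 64-bit limbs
        return [(n >> (64 * i)) & ((1 << 64) - 1) for i in range(k)]

    def inv_p521_ref(x_limbs):
        x = limbs_to_int(x_limbs)            # x need not be reduced mod p_521
        z = pow(x, P521 - 2, P521)           # Fermat: x^(p-2) == x^-1 (mod p) for prime p
        return int_to_limbs(z)               # output is fully reduced, z < p_521

    # Example: the inverse of 2 modulo p_521 is (p_521 + 1)/2 = 2^520.
    assert limbs_to_int(inv_p521_ref(int_to_limbs(2))) == 1 << 520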
-// -// Standard ARM ABI: X0 = z, X1 = x -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p521) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p521) - - .text - .balign 4 - -// Size in bytes of a 64-bit word - -#define N 8 - -// Used for the return pointer - -#define res x20 - -// Loop counter and d = 2 * delta value for divstep - -#define i x21 -#define d x22 - -// Registers used for matrix element magnitudes and signs - -#define m00 x10 -#define m01 x11 -#define m10 x12 -#define m11 x13 -#define s00 x14 -#define s01 x15 -#define s10 x16 -#define s11 x17 - -// Initial carries for combinations - -#define car0 x9 -#define car1 x19 - -// Input and output, plain registers treated according to pattern - -#define reg0 x0, #0 -#define reg1 x1, #0 -#define reg2 x2, #0 -#define reg3 x3, #0 -#define reg4 x4, #0 - -#define x x1, #0 -#define z x0, #0 - -// Pointer-offset pairs for temporaries on stack - -#define f sp, #0 -#define g sp, #(9*N) -#define u sp, #(18*N) -#define v sp, #(27*N) - -// Total size to reserve on the stack - -#define NSPACE #(36*N) - -// Very similar to a subroutine call to the s2n-bignum word_divstep59. -// But different in register usage and returning the final matrix in -// registers as follows -// -// [ m00 m01] -// [ m10 m11] - -#define divstep59() \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - 
csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x8, x4, #0x100, lsl #12; \ - sbfx x8, x8, #21, #21; \ - mov x11, #0x100000; \ - add x11, x11, x11, lsl #21; \ - add x9, x4, x11; \ - asr x9, x9, #42; \ - add x10, x5, #0x100, lsl #12; \ - sbfx x10, x10, #21, #21; \ - add x11, x5, x11; \ - asr x11, x11, #42; \ - mul x6, x8, x2; \ - mul x7, x9, x3; \ - mul x2, x10, x2; \ - mul x3, x11, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, #0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr 
x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #21, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #42; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #21, #21; \ - add x15, x5, x15; \ - asr x15, x15, #42; \ - mul x6, x12, x2; \ - mul x7, x13, x3; \ - mul x2, x14, x2; \ - mul x3, x15, x3; \ - add x4, x6, x7; \ - add x5, x2, x3; \ - asr x2, x4, #20; \ - asr x3, x5, #20; \ - and x4, x2, #0xfffff; \ - orr x4, x4, #0xfffffe0000000000; \ - and x5, x3, #0xfffff; \ - orr x5, x5, 
#0xc000000000000000; \ - tst x5, #0x1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x6, x14, x8; \ - mul x7, x14, x9; \ - madd x8, x13, x10, x2; \ - madd x9, x13, x11, x3; \ - madd x16, x15, x10, x6; \ - madd x17, x15, x11, x7; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel 
x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - tst x5, #0x2; \ - asr x5, x5, #1; \ - csel x6, x4, xzr, ne; \ - ccmp x1, xzr, #0x8, ne; \ - cneg x1, x1, ge; \ - cneg x6, x6, ge; \ - csel x4, x5, x4, ge; \ - add x5, x5, x6; \ - add x1, x1, #0x2; \ - asr x5, x5, #1; \ - add x12, x4, #0x100, lsl #12; \ - sbfx x12, x12, #22, #21; \ - mov x15, #0x100000; \ - add x15, x15, x15, lsl #21; \ - add x13, x4, x15; \ - asr x13, x13, #43; \ - add x14, x5, #0x100, lsl #12; \ - sbfx x14, x14, #22, #21; \ - add x15, x5, x15; \ - asr x15, x15, #43; \ - mneg x2, x12, x8; \ - mneg x3, x12, x9; \ - mneg x4, x14, x8; \ - mneg x5, x14, x9; \ - msub m00, x13, x16, x2; \ - msub m01, x13, x17, x3; \ - msub m10, x15, x16, x4; \ - msub m11, x15, x17, x5 - -// Loading large constants - -#define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ - movk nn, n3, lsl #48 - -S2N_BN_SYMBOL(bignum_inv_p521): - -// Save registers and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - sub sp, sp, NSPACE - -// Save the return pointer for the end so we can overwrite x0 later - - mov res, x0 - -// Copy the prime p_521 = 2^521 - 1 into the f variable - - mov x10, #0xFFFFFFFFFFFFFFFF - stp x10, x10, [f] - stp x10, x10, [f+16] - stp x10, x10, [f+32] - stp x10, x10, [f+48] - mov x11, #0x1FF - str x11, [f+64] - -// Copy the input into the g variable, but reduce it strictly mod p_521 -// so that g <= f as assumed in the bound proof. This code fragment is -// very similar to bignum_mod_p521_9 complete with carry condensation. - - ldr x8, [x1, #64] - lsr x9, x8, #9 - - subs xzr, xzr, xzr - ldp x10, x11, [x1] - adcs xzr, x10, x9 - adcs xzr, x11, xzr - ldp x12, x13, [x1, #16] - and x7, x12, x13 - adcs xzr, x7, xzr - ldp x14, x15, [x1, #32] - and x7, x14, x15 - adcs xzr, x7, xzr - ldp x16, x17, [x1, #48] - and x7, x16, x17 - adcs xzr, x7, xzr - orr x7, x8, #~0x1FF - adcs x7, x7, xzr - - adcs x10, x10, x9 - adcs x11, x11, xzr - adcs x12, x12, xzr - adcs x13, x13, xzr - adcs x14, x14, xzr - adcs x15, x15, xzr - adcs x16, x16, xzr - adcs x17, x17, xzr - adc x8, x8, xzr - and x8, x8, #0x1FF - - stp x10, x11, [g] - stp x12, x13, [g+16] - stp x14, x15, [g+32] - stp x16, x17, [g+48] - str x8, [g+64] - -// Also maintain weakly reduced < 2*p_521 vector [u,v] such that -// [f,g] == x * 2^{1239-59*i} * [u,v] (mod p_521) -// starting with [p_521,x] == x * 2^{1239-59*0} * [0,2^-1239] (mod p_521) -// Note that because (2^{a+521} == 2^a) (mod p_521) we simply have -// (2^-1239 == 2^324) (mod p_521) so the constant initializer is simple. -// -// Based on the standard divstep bound, for inputs <= 2^b we need at least -// n >= (9437 * b + 1) / 4096. Since b is 521, that means 1201 iterations. -// Since we package divstep in multiples of 59 bits, we do 21 blocks of 59 -// making *1239* total. (With a bit more effort we could avoid the full 59 -// divsteps and use a shorter tail computation, but we keep it simple.) -// Hence, after the 21st iteration we have [f,g] == x * [u,v] and since -// |f| = 1 we get the modular inverse from u by flipping its sign with f. 
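The bookkeeping in the comment above is easy to sanity-check independently of the assembly. A short Python sketch (pure integer arithmetic, no s2n-bignum types assumed) confirms that 2^324 really is 2^-1239 mod p_521, that 21 blocks of 59 divsteps cover the standard bound, and that storing 16 in 64-bit limb 5 of v encodes exactly 2^324.

    # Sanity checks for the divstep bookkeeping (integer model only).
    P521 = (1 << 521) - 1

    # 2^521 == 1 (mod p_521), hence 2^-1239 == 2^(3*521 - 1239) == 2^324.
    assert (pow(2, 1239, P521) * pow(2, 324, P521)) % P521 == 1

    # Divstep bound for b = 521-bit inputs: n >= ceil((9437*b + 1)/4096) = 1201,
    # and 21 blocks of 59 divsteps give 1239 >= 1201.
    b = 521
    n_min = -(-(9437 * b + 1) // 4096)       # ceiling division
    assert n_min == 1201 and 21 * 59 >= n_min

    # v is initialized with 16 in limb 5 of nine 64-bit limbs: 16 * 2^(64*5) == 2^324.
    assert 16 << (64 * 5) == 1 << 324

This matches the mov x10, #16 / stp xzr, x10, [v+32] initialization of v that follows.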
- - stp xzr, xzr, [u] - stp xzr, xzr, [u+16] - stp xzr, xzr, [u+32] - stp xzr, xzr, [u+48] - str xzr, [u+64] - - mov x10, #16 - stp xzr, xzr, [v] - stp xzr, xzr, [v+16] - stp xzr, x10, [v+32] - stp xzr, xzr, [v+48] - str xzr, [v+64] - -// Start of main loop. We jump into the middle so that the divstep -// portion is common to the special 21st iteration after a uniform -// first 20. - - mov i, #21 - mov d, #1 - b bignum_inv_p521_midloop - -bignum_inv_p521_loop: - -// Separate the matrix elements into sign-magnitude pairs - - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - -// Adjust the initial values to allow for complement instead of negation -// This initial offset is the same for [f,g] and [u,v] compositions. -// Save it in stable registers for the [u,v] part and do [f,g] first. - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - - and x0, m10, s10 - and x1, m11, s11 - add car1, x0, x1 - -// Now the computation of the updated f and g values. This maintains a -// 2-word carry between stages so we can conveniently insert the shift -// right by 59 before storing back, and not overwrite digits we need -// again of the old f and g values. -// -// Digit 0 of [f,g] - - ldr x7, [f] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [g] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x3, x3, x1 - -// Digit 1 of [f,g] - - ldr x7, [f+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [g+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [f] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [g] - -// Digit 2 of [f,g] - - ldr x7, [f+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [g+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [f+N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [g+N] - -// Digit 3 of [f,g] - - ldr x7, [f+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [g+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [f+2*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [g+2*N] - -// Digit 4 of [f,g] - - ldr x7, [f+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [g+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [f+3*N] 
- - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [g+3*N] - -// Digit 5 of [f,g] - - ldr x7, [f+5*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, xzr, x1 - ldr x8, [g+5*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [f+4*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [g+4*N] - -// Digit 6 of [f,g] - - ldr x7, [f+6*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [g+6*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - adc x6, x6, x1 - extr x4, x2, x4, #59 - str x4, [f+5*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - adc x4, x4, x1 - extr x5, x3, x5, #59 - str x5, [g+5*N] - -// Digit 7 of [f,g] - - ldr x7, [f+7*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [g+7*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - adc x5, x5, x1 - extr x2, x6, x2, #59 - str x2, [f+6*N] - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - adc x2, x2, x1 - extr x3, x4, x3, #59 - str x3, [g+6*N] - -// Digits 8 and 9 of [f,g] - - ldr x7, [f+8*N] - eor x1, x7, s00 - asr x3, x1, #63 - and x3, x3, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [g+8*N] - eor x1, x8, s01 - asr x0, x1, #63 - and x0, x0, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - extr x6, x5, x6, #59 - str x6, [f+7*N] - extr x5, x3, x5, #59 - str x5, [f+8*N] - - eor x1, x7, s10 - asr x5, x1, #63 - and x5, x5, m10 - neg x5, x5 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x5, x5, x1 - eor x1, x8, s11 - asr x0, x1, #63 - and x0, x0, m11 - sub x5, x5, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x5, x5, x1 - extr x4, x2, x4, #59 - str x4, [g+7*N] - extr x2, x5, x2, #59 - str x2, [g+8*N] - -// Now the computation of the updated u and v values and their -// modular reductions. A very similar accumulation except that -// the top words of u and v are unsigned and we don't shift. 
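Concretely, each iteration applies the same 2x2 matrix to (u,v) as to (f,g), but without the right shift by 59: the missing 2^-59 factor is absorbed by the exponent 1239-59*i in the invariant quoted earlier. The following is a hedged integer-level model of one block update; the names update_fg and update_uv are illustrative, and the real code works digit by digit in sign-magnitude form and only reduces u,v weakly to < 2*p_521.

    # Integer-level model of one 59-divstep block update (not the real word-level code).
    P521 = (1 << 521) - 1

    def update_fg(M, f, g):
        m00, m01, m10, m11 = M
        # The divstep matrix is built so both combinations are multiples of 2^59.
        return (m00 * f + m01 * g) >> 59, (m10 * f + m11 * g) >> 59

    def update_uv(M, u, v):
        m00, m01, m10, m11 = M
        # Same matrix, no shift; this model reduces fully, the assembly only weakly.
        return (m00 * u + m01 * v) % P521, (m10 * u + m11 * v) % P521

Because (f,g) picks up a 2^-59 factor that (u,v) does not, the relation [f,g] == x * 2^{1239-59*i} * [u,v] (mod p_521) is preserved as i advances.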
-// -// Digit 0 of [u,v] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, car1, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v] - adc x3, x3, x1 - -// Digit 1 of [u,v] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - str x3, [v+N] - adc x4, x4, x1 - -// Digit 2 of [u,v] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - str x4, [v+2*N] - adc x2, x2, x1 - -// Digit 3 of [u,v] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x6, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - str x2, [v+3*N] - adc x6, x6, x1 - -// Digit 4 of [u,v] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x6, x6, x0 - adc x5, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x6, x6, x0 - str x6, [v+4*N] - adc x5, x5, x1 - -// Digit 5 of [u,v] - - ldr x7, [u+5*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, xzr, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x5, x5, x0 - adc x3, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x5, x5, x0 - str x5, [v+5*N] - adc x3, x3, x1 - -// Digit 6 of [u,v] - - ldr x7, [u+6*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+6*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+6*N] - adc x6, x6, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x3, x3, x0 - adc x4, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x3, x3, x0 - str x3, [v+6*N] - adc x4, x4, x1 - -// Digit 7 of [u,v] - - ldr x7, [u+7*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+7*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+7*N] - adc x5, x5, x1 - - eor x1, x7, s10 - mul x0, x1, m10 - umulh x1, x1, 
m10 - adds x4, x4, x0 - adc x2, xzr, x1 - eor x1, x8, s11 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x4, x4, x0 - str x4, [v+7*N] - adc x2, x2, x1 - -// Digits 8 and 9 of u (top is unsigned) - - ldr x7, [u+8*N] - eor x1, x7, s00 - and x3, s00, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [v+8*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - -// Modular reduction of u, reloading as needed from u[0],...,u[7],x5,x3 - - extr x6, x3, x5, #9 - ldp x0, x1, [u] - add x6, x6, x3, asr #63 - sub x5, x5, x6, lsl #9 - adds x0, x0, x6 - asr x6, x6, #63 - adcs x1, x1, x6 - stp x0, x1, [u] - ldp x0, x1, [u+16] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [u+16] - ldp x0, x1, [u+32] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [u+32] - ldp x0, x1, [u+48] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [u+48] - adc x5, x5, x6 - str x5, [u+64] - -// Digits 8 and 9 of v (top is unsigned) - - eor x1, x7, s10 - and x5, s10, m10 - neg x5, x5 - mul x0, x1, m10 - umulh x1, x1, m10 - adds x2, x2, x0 - adc x5, x5, x1 - eor x1, x8, s11 - and x0, s11, m11 - sub x5, x5, x0 - mul x0, x1, m11 - umulh x1, x1, m11 - adds x2, x2, x0 - adc x5, x5, x1 - -// Modular reduction of v, reloading as needed from v[0],...,v[7],x2,x5 - - extr x6, x5, x2, #9 - ldp x0, x1, [v] - add x6, x6, x5, asr #63 - sub x2, x2, x6, lsl #9 - adds x0, x0, x6 - asr x6, x6, #63 - adcs x1, x1, x6 - stp x0, x1, [v] - ldp x0, x1, [v+16] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [v+16] - ldp x0, x1, [v+32] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [v+32] - ldp x0, x1, [v+48] - adcs x0, x0, x6 - adcs x1, x1, x6 - stp x0, x1, [v+48] - adc x2, x2, x6 - str x2, [v+64] - -bignum_inv_p521_midloop: - - mov x1, d - ldr x2, [f] - ldr x3, [g] - divstep59() - mov d, x1 - -// Next iteration - - subs i, i, #1 - bne bignum_inv_p521_loop - -// The 21st and last iteration does not need anything except the -// u value and the sign of f; the latter can be obtained from the -// lowest word of f. So it's done differently from the main loop. -// Find the sign of the new f. For this we just need one digit -// since we know (for in-scope cases) that f is either +1 or -1. -// We don't explicitly shift right by 59 either, but looking at -// bit 63 (or any bit >= 60) of the unshifted result is enough -// to distinguish -1 from +1; this is then made into a mask. - - ldr x0, [f] - ldr x1, [g] - mul x0, x0, m00 - madd x1, x1, m01, x0 - asr x0, x1, #63 - -// Now separate out the matrix into sign-magnitude pairs -// and adjust each one based on the sign of f. -// -// Note that at this point we expect |f|=1 and we got its -// sign above, so then since [f,0] == x * [u,v] (mod p_521) -// we want to flip the sign of u according to that of f. 
- - cmp m00, xzr - csetm s00, mi - cneg m00, m00, mi - eor s00, s00, x0 - - cmp m01, xzr - csetm s01, mi - cneg m01, m01, mi - eor s01, s01, x0 - - cmp m10, xzr - csetm s10, mi - cneg m10, m10, mi - eor s10, s10, x0 - - cmp m11, xzr - csetm s11, mi - cneg m11, m11, mi - eor s11, s11, x0 - -// Adjust the initial value to allow for complement instead of negation - - and x0, m00, s00 - and x1, m01, s01 - add car0, x0, x1 - -// Digit 0 of [u] - - ldr x7, [u] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, car0, x0 - adc x2, xzr, x1 - ldr x8, [v] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u] - adc x2, x2, x1 - -// Digit 1 of [u] - - ldr x7, [u+N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+N] - adc x6, x6, x1 - -// Digit 2 of [u] - - ldr x7, [u+2*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+2*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+2*N] - adc x5, x5, x1 - -// Digit 3 of [u] - - ldr x7, [u+3*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, xzr, x1 - ldr x8, [v+3*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - str x5, [u+3*N] - adc x3, x3, x1 - -// Digit 4 of [u] - - ldr x7, [u+4*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x3, x3, x0 - adc x4, xzr, x1 - ldr x8, [v+4*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x3, x3, x0 - str x3, [u+4*N] - adc x4, x4, x1 - -// Digit 5 of [u] - - ldr x7, [u+5*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x4, x4, x0 - adc x2, xzr, x1 - ldr x8, [v+5*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x4, x4, x0 - str x4, [u+5*N] - adc x2, x2, x1 - -// Digit 6 of [u] - - ldr x7, [u+6*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x2, x2, x0 - adc x6, xzr, x1 - ldr x8, [v+6*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x2, x2, x0 - str x2, [u+6*N] - adc x6, x6, x1 - -// Digit 7 of [u] - - ldr x7, [u+7*N] - eor x1, x7, s00 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x6, x6, x0 - adc x5, xzr, x1 - ldr x8, [v+7*N] - eor x1, x8, s01 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x6, x6, x0 - str x6, [u+7*N] - adc x5, x5, x1 - -// Digits 8 and 9 of u (top is unsigned) - - ldr x7, [u+8*N] - eor x1, x7, s00 - and x3, s00, m00 - neg x3, x3 - mul x0, x1, m00 - umulh x1, x1, m00 - adds x5, x5, x0 - adc x3, x3, x1 - ldr x8, [v+8*N] - eor x1, x8, s01 - and x0, s01, m01 - sub x3, x3, x0 - mul x0, x1, m01 - umulh x1, x1, m01 - adds x5, x5, x0 - adc x3, x3, x1 - -// Modular reduction of u, reloading as needed from u[0],...,u[7],x5,x3 - - extr x6, x3, x5, #9 - ldp x10, x11, [u] - add x6, x6, x3, asr #63 - sub x5, x5, x6, lsl #9 - adds x10, x10, x6 - asr x6, x6, #63 - adcs x11, x11, x6 - ldp x12, x13, [u+16] - adcs x12, x12, x6 - adcs x13, x13, x6 - ldp x14, x15, [u+32] - adcs x14, x14, x6 - adcs x15, x15, x6 - ldp x16, x17, [u+48] - adcs x16, x16, x6 - adcs x17, x17, x6 - adc x19, x5, x6 - -// Further strict reduction ready for the output, which just means -// a conditional subtraction of p_521 - - subs x0, x10, #-1 - adcs x1, x11, xzr - adcs x2, x12, xzr - adcs x3, x13, xzr - adcs x4, x14, xzr - adcs x5, x15, xzr - adcs x6, x16, xzr - adcs x7, x17, xzr - mov x8, #0x1FF - sbcs x8, 
x19, x8 - - csel x0, x0, x10, cs - csel x1, x1, x11, cs - csel x2, x2, x12, cs - csel x3, x3, x13, cs - csel x4, x4, x14, cs - csel x5, x5, x15, cs - csel x6, x6, x16, cs - csel x7, x7, x17, cs - csel x8, x8, x19, cs - -// Store it back to the final output - - stp x0, x1, [res] - stp x2, x3, [res, #16] - stp x4, x5, [res, #32] - stp x6, x7, [res, #48] - str x8, [res, #64] - -// Restore stack and registers - - add sp, sp, NSPACE - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jadd_alt.S b/third_party/s2n-bignum/arm/p521/p521_jadd_alt.S deleted file mode 100644 index 72c9239be29..00000000000 --- a/third_party/s2n-bignum/arm/p521/p521_jadd_alt.S +++ /dev/null @@ -1,979 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point addition on NIST curve P-521 in Jacobian coordinates -// -// extern void p521_jadd_alt -// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); -// -// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. -// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). -// It is assumed that all coordinates of the input points p1 and p2 are -// fully reduced mod p_521, that both z coordinates are nonzero and -// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents -// the same affine point as". -// -// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jadd_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jadd_alt) - - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 72 - -// Stable homes for input arguments during main code sequence - -#define input_z x26 -#define input_x x27 -#define input_y x28 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_2 input_y, #0 -#define y_2 input_y, #NUMSIZE -#define z_2 input_y, #(2*NUMSIZE) - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define z1sq sp, #(NUMSIZE*0) -#define ww sp, #(NUMSIZE*0) -#define resx sp, #(NUMSIZE*0) - -#define yd sp, #(NUMSIZE*1) -#define y2a sp, #(NUMSIZE*1) - -#define x2a sp, #(NUMSIZE*2) -#define zzx2 sp, #(NUMSIZE*2) - -#define zz sp, #(NUMSIZE*3) -#define t1 sp, #(NUMSIZE*3) - -#define t2 sp, #(NUMSIZE*4) -#define x1a sp, #(NUMSIZE*4) -#define zzx1 sp, #(NUMSIZE*4) -#define resy sp, #(NUMSIZE*4) - -#define xd sp, #(NUMSIZE*5) -#define z2sq sp, #(NUMSIZE*5) -#define resz sp, #(NUMSIZE*5) - -#define y1a sp, #(NUMSIZE*6) - -// NUMSIZE*7 is not 16-aligned so we round it up - -#define NSPACE (NUMSIZE*7+8) - -// Corresponds exactly to bignum_mul_p521_alt - -#define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp 
x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh 
x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh 
x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sqr_p521_alt - -#define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ - adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ - adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, 
x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ - adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; \ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] - -// Corresponds exactly to bignum_sub_p521 - -#define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, 
x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -S2N_BN_SYMBOL(p521_jadd_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! - stp x27, x28, [sp, #-16]! - sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - mov input_y, x2 - -// Main code, just a sequence of basic field operations - - sqr_p521(z1sq,z_1) - sqr_p521(z2sq,z_2) - - mul_p521(y1a,z_2,y_1) - mul_p521(y2a,z_1,y_2) - - mul_p521(x2a,z1sq,x_2) - mul_p521(x1a,z2sq,x_1) - mul_p521(y2a,z1sq,y2a) - mul_p521(y1a,z2sq,y1a) - - sub_p521(xd,x2a,x1a) - sub_p521(yd,y2a,y1a) - - sqr_p521(zz,xd) - sqr_p521(ww,yd) - - mul_p521(zzx1,zz,x1a) - mul_p521(zzx2,zz,x2a) - - sub_p521(resx,ww,zzx1) - sub_p521(t1,zzx2,zzx1) - - mul_p521(xd,xd,z_1) - - sub_p521(resx,resx,zzx2) - - sub_p521(t2,zzx1,resx) - - mul_p521(t1,t1,y1a) - mul_p521(resz,xd,z_2) - mul_p521(t2,yd,t2) - - sub_p521(resy,t2,t1) - -// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 -// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) -// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) -// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 -// Multiplex the z outputs accordingly and re-store in resz - - ldp x0, x1, [z_1] - ldp x2, x3, [z_1+16] - ldp x4, x5, [z_1+32] - ldp x6, x7, [z_1+48] - ldr x8, [z_1+64] - - orr x20, x0, x1 - orr x21, x2, x3 - orr x22, x4, x5 - orr x23, x6, x7 - orr x20, x20, x21 - orr x22, x22, x23 - orr x20, x20, x8 - orr x20, x20, x22 - cmp x20, xzr - cset x20, ne - - ldp x10, x11, [z_2] - ldp x12, x13, [z_2+16] - ldp x14, x15, [z_2+32] - ldp x16, x17, [z_2+48] - ldr x19, [z_2+64] - - orr x21, x10, x11 - orr x22, x12, x13 - orr x23, x14, x15 - orr x24, x16, x17 - orr x21, x21, x22 - orr x23, x23, x24 - orr x21, x21, x19 - orr x21, x21, x23 - - csel x0, x0, x10, ne - csel x1, x1, x11, ne - csel x2, x2, x12, ne - csel x3, x3, x13, ne - csel x4, x4, x14, ne - csel x5, x5, x15, ne - csel x6, x6, x16, ne - csel x7, x7, x17, ne - csel x8, x8, x19, ne - - cmp x21, xzr - cset x21, ne - - cmp x21, x20 - - ldp x10, x11, [resz] - ldp x12, x13, [resz+16] - ldp x14, x15, [resz+32] - ldp x16, x17, [resz+48] - ldr x19, [resz+64] - - csel x0, x0, x10, ne - csel x1, x1, x11, ne - csel x2, x2, x12, ne - csel x3, x3, x13, ne - csel x4, x4, x14, ne - csel x5, x5, x15, ne - csel x6, x6, x16, ne - csel x7, x7, x17, ne - csel x8, x8, x19, ne - - stp x0, x1, [resz] - stp x2, x3, [resz+16] - stp x4, x5, [resz+32] - stp x6, x7, [resz+48] - str x8, [resz+64] - -// Multiplex the x and y outputs too, keeping the results in registers - - ldp x20, x21, [x_1] - ldp x0, x1, [resx] - csel x0, x20, x0, lo - csel x1, x21, x1, lo - ldp x20, x21, [x_2] - csel x0, x20, x0, hi - csel x1, x21, x1, hi - - ldp x20, x21, [x_1+16] - ldp x2, x3, [resx+16] - csel x2, x20, x2, lo - csel x3, x21, x3, lo - ldp x20, x21, [x_2+16] - csel x2, x20, x2, hi - csel x3, x21, x3, hi - - ldp x20, x21, [x_1+32] - ldp x4, x5, [resx+32] - csel x4, x20, x4, lo - csel x5, x21, x5, lo - ldp x20, x21, [x_2+32] - csel x4, x20, x4, hi - csel x5, x21, x5, hi - - ldp x20, x21, [x_1+48] - ldp x6, x7, [resx+48] - csel x6, x20, x6, lo - csel 
x7, x21, x7, lo - ldp x20, x21, [x_2+48] - csel x6, x20, x6, hi - csel x7, x21, x7, hi - - ldr x20, [x_1+64] - ldr x8, [resx+64] - csel x8, x20, x8, lo - ldr x21, [x_2+64] - csel x8, x21, x8, hi - - - ldp x20, x21, [y_1] - ldp x10, x11, [resy] - csel x10, x20, x10, lo - csel x11, x21, x11, lo - ldp x20, x21, [y_2] - csel x10, x20, x10, hi - csel x11, x21, x11, hi - - ldp x20, x21, [y_1+16] - ldp x12, x13, [resy+16] - csel x12, x20, x12, lo - csel x13, x21, x13, lo - ldp x20, x21, [y_2+16] - csel x12, x20, x12, hi - csel x13, x21, x13, hi - - ldp x20, x21, [y_1+32] - ldp x14, x15, [resy+32] - csel x14, x20, x14, lo - csel x15, x21, x15, lo - ldp x20, x21, [y_2+32] - csel x14, x20, x14, hi - csel x15, x21, x15, hi - - ldp x20, x21, [y_1+48] - ldp x16, x17, [resy+48] - csel x16, x20, x16, lo - csel x17, x21, x17, lo - ldp x20, x21, [y_2+48] - csel x16, x20, x16, hi - csel x17, x21, x17, hi - - ldr x20, [y_1+64] - ldr x19, [resy+64] - csel x19, x20, x19, lo - ldr x21, [y_2+64] - csel x19, x21, x19, hi - -// Finally store back the multiplexed values - - stp x0, x1, [x_3] - stp x2, x3, [x_3+16] - stp x4, x5, [x_3+32] - stp x6, x7, [x_3+48] - str x8, [x_3+64] - - ldp x0, x1, [resz] - ldp x2, x3, [resz+16] - ldp x4, x5, [resz+32] - ldp x6, x7, [resz+48] - ldr x8, [resz+64] - - stp x10, x11, [y_3] - stp x12, x13, [y_3+16] - stp x14, x15, [y_3+32] - stp x16, x17, [y_3+48] - str x19, [y_3+64] - - stp x0, x1, [z_3] - stp x2, x3, [z_3+16] - stp x4, x5, [z_3+32] - stp x6, x7, [z_3+48] - str x8, [z_3+64] - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x27, x28, [sp], 16 - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jdouble_alt.S b/third_party/s2n-bignum/arm/p521/p521_jdouble_alt.S deleted file mode 100644 index fa61dcf8d9e..00000000000 --- a/third_party/s2n-bignum/arm/p521/p521_jdouble_alt.S +++ /dev/null @@ -1,1458 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point doubling on NIST curve P-521 in Jacobian coordinates -// -// extern void p521_jdouble_alt -// (uint64_t p3[static 27],uint64_t p1[static 27]); -// -// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. -// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). -// It is assumed that all coordinates of the input point are fully -// reduced mod p_521 and that the z coordinate is not zero. 
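
Editorial note, not part of the diff: the field-operation sequence of the deleted p521_jdouble_alt further down matches the usual Jacobian doubling for a curve with a = -3, which P-521 is. Writing gamma = y^2, beta = x*y^2 and alpha = 3*x^2 + a*z^4 = 3*(x - z^2)*(x + z^2) = 3*x2p, the textbook formulas are

    x' = alpha^2 - 8*beta
    y' = alpha*(4*beta - x') - 8*gamma^2
    z' = (y + z)^2 - y^2 - z^2 = 2*y*z

and since x4p = x2p^2 = (alpha/3)^2, the code's d = 12*xy2 - 9*x4p is 12*beta - alpha^2, so x_3 = 4*xy2 - d = alpha^2 - 8*beta and y_3 = 3*dx2 - 8*y4 = alpha*(12*beta - alpha^2) - 8*gamma^2, in agreement with the formulas above.
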
-// -// Standard ARM ABI: X0 = p3, X1 = p1 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jdouble_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jdouble_alt) - - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 72 - -// Stable homes for input arguments during main code sequence - -#define input_z x26 -#define input_x x27 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries - -#define z2 sp, #(NUMSIZE*0) -#define y2 sp, #(NUMSIZE*1) -#define x2p sp, #(NUMSIZE*2) -#define xy2 sp, #(NUMSIZE*3) - -#define y4 sp, #(NUMSIZE*4) -#define t2 sp, #(NUMSIZE*4) - -#define dx2 sp, #(NUMSIZE*5) -#define t1 sp, #(NUMSIZE*5) - -#define d sp, #(NUMSIZE*6) -#define x4p sp, #(NUMSIZE*6) - -// NUMSIZE*7 is not 16-aligned so we round it up - -#define NSPACE (NUMSIZE*7+8) - -// Corresponds exactly to bignum_mul_p521_alt - -#define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, 
x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, 
x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sqr_p521_alt - -#define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ - adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ 
- adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ - adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; \ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, 
x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] - -// Corresponds exactly to bignum_add_p521 - -#define add_p521(P0,P1,P2) \ - cmp xzr, xzr; \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - adcs x5, x5, x4; \ - adcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - adcs x9, x9, x4; \ - adcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - adcs x11, x11, x4; \ - adcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - adc x13, x13, x4; \ - subs x4, x13, #512; \ - csetm x4, hs; \ - sbcs x5, x5, xzr; \ - and x4, x4, #0x200; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, x4; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sub_p521 - -#define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Weak multiplication not fully reducing - -#define weakmul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - 
umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul 
x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adds x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - 
extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - and x13, x24, #0x1ff; \ - lsr x14, x21, #9; \ - adc x13, x13, x14; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) - -#define cmsub_p521(P0,C,P1,D,P2) \ - ldp x6, x7, [P1]; \ - mov x1, #(C); \ - mul x3, x1, x6; \ - mul x4, x1, x7; \ - umulh x6, x1, x6; \ - adds x4, x4, x6; \ - umulh x7, x1, x7; \ - ldp x8, x9, [P1+16]; \ - mul x5, x1, x8; \ - mul x6, x1, x9; \ - umulh x8, x1, x8; \ - adcs x5, x5, x7; \ - umulh x9, x1, x9; \ - adcs x6, x6, x8; \ - ldp x10, x11, [P1+32]; \ - mul x7, x1, x10; \ - mul x8, x1, x11; \ - umulh x10, x1, x10; \ - adcs x7, x7, x9; \ - umulh x11, x1, x11; \ - adcs x8, x8, x10; \ - ldp x12, x13, [P1+48]; \ - mul x9, x1, x12; \ - mul x10, x1, x13; \ - umulh x12, x1, x12; \ - adcs x9, x9, x11; \ - umulh x13, x1, x13; \ - adcs x10, x10, x12; \ - ldr x14, [P1+64]; \ - mul x11, x1, x14; \ - adc x11, x11, x13; \ - mov x1, #(D); \ - ldp x20, x21, [P2]; \ - mvn x20, x20; \ - mul x0, x1, x20; \ - umulh x20, x1, x20; \ - adds x3, x3, x0; \ - mvn x21, x21; \ - mul x0, x1, x21; \ - umulh x21, x1, x21; \ - adcs x4, x4, x0; \ - ldp x22, x23, [P2+16]; \ - mvn x22, x22; \ - mul x0, x1, x22; \ - umulh x22, x1, x22; \ - adcs x5, x5, x0; \ - mvn x23, x23; \ - mul x0, x1, x23; \ - umulh x23, x1, x23; \ - adcs x6, x6, x0; \ - ldp x17, x19, [P2+32]; \ - mvn x17, x17; \ - mul x0, x1, x17; \ - umulh x17, x1, x17; \ - adcs x7, x7, x0; \ - mvn x19, x19; \ - mul x0, x1, x19; \ - umulh x19, x1, x19; \ - adcs x8, x8, x0; \ - ldp x2, x16, [P2+48]; \ - mvn x2, x2; \ - mul x0, x1, x2; \ - umulh x2, x1, x2; \ - adcs x9, x9, x0; \ - mvn x16, x16; \ - mul x0, x1, x16; \ - umulh x16, x1, x16; \ - adcs x10, x10, x0; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - mul x0, x1, x0; \ - adc x11, x11, x0; \ - adds x4, x4, x20; \ - adcs x5, x5, x21; \ - and x15, x4, x5; \ - adcs x6, x6, x22; \ - and x15, x15, x6; \ - adcs x7, x7, x23; \ - and x15, x15, x7; \ - adcs x8, x8, x17; \ - and x15, x15, x8; \ - adcs x9, x9, x19; \ - and x15, x15, x9; \ - adcs x10, x10, x2; \ - and x15, x15, x10; \ - adc x11, x11, x16; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ - str x11, [P0+64] - -// P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) - -#define cmsub38_p521(P0,P1,P2) \ - ldp x6, x7, [P1]; \ - lsl x3, x6, #1; \ - adds x3, x3, x6; \ - extr x4, x7, x6, #63; \ - adcs x4, x4, x7; \ - ldp x8, x9, [P1+16]; \ - extr x5, x8, x7, #63; \ - adcs x5, x5, x8; \ - extr x6, x9, x8, #63; \ - adcs x6, x6, x9; \ - ldp x10, x11, [P1+32]; \ - extr x7, x10, x9, #63; \ - adcs x7, x7, x10; \ - extr x8, x11, x10, #63; \ - adcs x8, x8, x11; \ - ldp x12, x13, [P1+48]; \ - extr x9, x12, x11, #63; \ - adcs x9, x9, x12; \ - extr x10, x13, x12, 
#63; \ - adcs x10, x10, x13; \ - ldr x14, [P1+64]; \ - extr x11, x14, x13, #63; \ - adc x11, x11, x14; \ - ldp x20, x21, [P2]; \ - mvn x20, x20; \ - lsl x0, x20, #3; \ - adds x3, x3, x0; \ - mvn x21, x21; \ - extr x0, x21, x20, #61; \ - adcs x4, x4, x0; \ - ldp x22, x23, [P2+16]; \ - mvn x22, x22; \ - extr x0, x22, x21, #61; \ - adcs x5, x5, x0; \ - and x15, x4, x5; \ - mvn x23, x23; \ - extr x0, x23, x22, #61; \ - adcs x6, x6, x0; \ - and x15, x15, x6; \ - ldp x20, x21, [P2+32]; \ - mvn x20, x20; \ - extr x0, x20, x23, #61; \ - adcs x7, x7, x0; \ - and x15, x15, x7; \ - mvn x21, x21; \ - extr x0, x21, x20, #61; \ - adcs x8, x8, x0; \ - and x15, x15, x8; \ - ldp x22, x23, [P2+48]; \ - mvn x22, x22; \ - extr x0, x22, x21, #61; \ - adcs x9, x9, x0; \ - and x15, x15, x9; \ - mvn x23, x23; \ - extr x0, x23, x22, #61; \ - adcs x10, x10, x0; \ - and x15, x15, x10; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - extr x0, x0, x23, #61; \ - adc x11, x11, x0; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ - str x11, [P0+64] - -// P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) - -#define cmsub41_p521(P0,P1,P2) \ - ldp x6, x7, [P1]; \ - lsl x3, x6, #2; \ - extr x4, x7, x6, #62; \ - ldp x8, x9, [P1+16]; \ - extr x5, x8, x7, #62; \ - extr x6, x9, x8, #62; \ - ldp x10, x11, [P1+32]; \ - extr x7, x10, x9, #62; \ - extr x8, x11, x10, #62; \ - ldp x12, x13, [P1+48]; \ - extr x9, x12, x11, #62; \ - extr x10, x13, x12, #62; \ - ldr x14, [P1+64]; \ - extr x11, x14, x13, #62; \ - ldp x0, x1, [P2]; \ - mvn x0, x0; \ - adds x3, x3, x0; \ - sbcs x4, x4, x1; \ - ldp x0, x1, [P2+16]; \ - sbcs x5, x5, x0; \ - and x15, x4, x5; \ - sbcs x6, x6, x1; \ - and x15, x15, x6; \ - ldp x0, x1, [P2+32]; \ - sbcs x7, x7, x0; \ - and x15, x15, x7; \ - sbcs x8, x8, x1; \ - and x15, x15, x8; \ - ldp x0, x1, [P2+48]; \ - sbcs x9, x9, x0; \ - and x15, x15, x9; \ - sbcs x10, x10, x1; \ - and x15, x15, x10; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - adc x11, x11, x0; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ - str x11, [P0+64] - -S2N_BN_SYMBOL(p521_jdouble_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! - stp x27, x28, [sp, #-16]! 
- sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - -// Main code, just a sequence of basic field operations - -// z2 = z^2 -// y2 = y^2 - - sqr_p521(z2,z_1) - sqr_p521(y2,y_1) - -// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) - - add_p521(t1,x_1,z2) - sub_p521(t2,x_1,z2) - mul_p521(x2p,t1,t2) - -// t1 = y + z -// x4p = x2p^2 -// xy2 = x * y^2 - - add_p521(t1,y_1,z_1) - sqr_p521(x4p,x2p) - weakmul_p521(xy2,x_1,y2) - -// t2 = (y + z)^2 - - sqr_p521(t2,t1) - -// d = 12 * xy2 - 9 * x4p -// t1 = y^2 + 2 * y * z - - cmsub_p521(d,12,xy2,9,x4p) - sub_p521(t1,t2,z2) - -// y4 = y^4 - - sqr_p521(y4,y2) - -// z_3' = 2 * y * z -// dx2 = d * x2p - - sub_p521(z_3,t1,y2) - weakmul_p521(dx2,d,x2p) - -// x' = 4 * xy2 - d - - cmsub41_p521(x_3,xy2,d) - -// y' = 3 * dx2 - 8 * y4 - - cmsub38_p521(y_3,dx2,y4) - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x27, x28, [sp], 16 - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jmixadd_alt.S b/third_party/s2n-bignum/arm/p521/p521_jmixadd_alt.S deleted file mode 100644 index 783ca28cf87..00000000000 --- a/third_party/s2n-bignum/arm/p521/p521_jmixadd_alt.S +++ /dev/null @@ -1,882 +0,0 @@ -// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 - -// ---------------------------------------------------------------------------- -// Point mixed addition on NIST curve P-521 in Jacobian coordinates -// -// extern void p521_jmixadd_alt -// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); -// -// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. -// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). -// The "mixed" part means that p2 only has x and y coordinates, with the -// implicit z coordinate assumed to be the identity. It is assumed that -// all the coordinates of the input points p1 and p2 are fully reduced -// mod p_521, that the z coordinate of p1 is nonzero and that neither -// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine -// point as". 
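
Editorial note, not part of the diff: in the mixed case the second point carries no z coordinate, i.e. it is treated as having z_2 = 1, so (judging from the temporaries defined below and the analogous full-addition routine p521_jadd_alt above) the general Jacobian addition simplifies: only z_1^2 and z_1^3 are needed to bring p2 onto p1's scale (zp2 = z_1^2, x2a = zp2 * x_2, y2a = z_1^3 * y_2), the differences become xd = x2a - x_1 and yd = y2a - y_1, and the output z coordinate is z_1 * xd rather than z_1 * z_2 * xd. This is also why the routine gets by with one fewer NUMSIZE temporary than p521_jadd_alt: NSPACE is NUMSIZE*6 here, which is already 16-aligned, versus NUMSIZE*7+8 above.
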
-// -// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 -// ---------------------------------------------------------------------------- -#include "_internal_s2n_bignum.h" - - S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jmixadd_alt) - S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jmixadd_alt) - - .text - .balign 4 - -// Size of individual field elements - -#define NUMSIZE 72 - -// Stable homes for input arguments during main code sequence - -#define input_z x26 -#define input_x x27 -#define input_y x28 - -// Pointer-offset pairs for inputs and outputs - -#define x_1 input_x, #0 -#define y_1 input_x, #NUMSIZE -#define z_1 input_x, #(2*NUMSIZE) - -#define x_2 input_y, #0 -#define y_2 input_y, #NUMSIZE - -#define x_3 input_z, #0 -#define y_3 input_z, #NUMSIZE -#define z_3 input_z, #(2*NUMSIZE) - -// Pointer-offset pairs for temporaries, with some aliasing -// NSPACE is the total stack needed for these temporaries - -#define zp2 sp, #(NUMSIZE*0) -#define ww sp, #(NUMSIZE*0) -#define resx sp, #(NUMSIZE*0) - -#define yd sp, #(NUMSIZE*1) -#define y2a sp, #(NUMSIZE*1) - -#define x2a sp, #(NUMSIZE*2) -#define zzx2 sp, #(NUMSIZE*2) - -#define zz sp, #(NUMSIZE*3) -#define t1 sp, #(NUMSIZE*3) - -#define t2 sp, #(NUMSIZE*4) -#define zzx1 sp, #(NUMSIZE*4) -#define resy sp, #(NUMSIZE*4) - -#define xd sp, #(NUMSIZE*5) -#define resz sp, #(NUMSIZE*5) - -#define NSPACE (NUMSIZE*6) - -// Corresponds exactly to bignum_mul_p521_alt - -#define mul_p521(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - mul x15, x3, x5; \ - umulh x16, x3, x5; \ - mul x14, x3, x6; \ - umulh x17, x3, x6; \ - adds x16, x16, x14; \ - ldp x7, x8, [P2+16]; \ - mul x14, x3, x7; \ - umulh x19, x3, x7; \ - adcs x17, x17, x14; \ - mul x14, x3, x8; \ - umulh x20, x3, x8; \ - adcs x19, x19, x14; \ - ldp x9, x10, [P2+32]; \ - mul x14, x3, x9; \ - umulh x21, x3, x9; \ - adcs x20, x20, x14; \ - mul x14, x3, x10; \ - umulh x22, x3, x10; \ - adcs x21, x21, x14; \ - ldp x11, x12, [P2+48]; \ - mul x14, x3, x11; \ - umulh x23, x3, x11; \ - adcs x22, x22, x14; \ - ldr x13, [P2+64]; \ - mul x14, x3, x12; \ - umulh x24, x3, x12; \ - adcs x23, x23, x14; \ - mul x14, x3, x13; \ - umulh x1, x3, x13; \ - adcs x24, x24, x14; \ - adc x1, x1, xzr; \ - mul x14, x4, x5; \ - adds x16, x16, x14; \ - mul x14, x4, x6; \ - adcs x17, x17, x14; \ - mul x14, x4, x7; \ - adcs x19, x19, x14; \ - mul x14, x4, x8; \ - adcs x20, x20, x14; \ - mul x14, x4, x9; \ - adcs x21, x21, x14; \ - mul x14, x4, x10; \ - adcs x22, x22, x14; \ - mul x14, x4, x11; \ - adcs x23, x23, x14; \ - mul x14, x4, x12; \ - adcs x24, x24, x14; \ - mul x14, x4, x13; \ - adcs x1, x1, x14; \ - cset x0, hs; \ - umulh x14, x4, x5; \ - adds x17, x17, x14; \ - umulh x14, x4, x6; \ - adcs x19, x19, x14; \ - umulh x14, x4, x7; \ - adcs x20, x20, x14; \ - umulh x14, x4, x8; \ - adcs x21, x21, x14; \ - umulh x14, x4, x9; \ - adcs x22, x22, x14; \ - umulh x14, x4, x10; \ - adcs x23, x23, x14; \ - umulh x14, x4, x11; \ - adcs x24, x24, x14; \ - umulh x14, x4, x12; \ - adcs x1, x1, x14; \ - umulh x14, x4, x13; \ - adc x0, x0, x14; \ - stp x15, x16, [P0]; \ - ldp x3, x4, [P1+16]; \ - mul x14, x3, x5; \ - adds x17, x17, x14; \ - mul x14, x3, x6; \ - adcs x19, x19, x14; \ - mul x14, x3, x7; \ - adcs x20, x20, x14; \ - mul x14, x3, x8; \ - adcs x21, x21, x14; \ - mul x14, x3, x9; \ - adcs x22, x22, x14; \ - mul x14, x3, x10; \ - adcs x23, x23, x14; \ - mul x14, x3, x11; \ - adcs x24, x24, x14; \ - mul x14, x3, x12; \ - adcs x1, x1, x14; \ - mul x14, x3, x13; \ - adcs x0, x0, x14; \ - cset x15, hs; \ - umulh x14, x3, x5; \ - adds 
x19, x19, x14; \ - umulh x14, x3, x6; \ - adcs x20, x20, x14; \ - umulh x14, x3, x7; \ - adcs x21, x21, x14; \ - umulh x14, x3, x8; \ - adcs x22, x22, x14; \ - umulh x14, x3, x9; \ - adcs x23, x23, x14; \ - umulh x14, x3, x10; \ - adcs x24, x24, x14; \ - umulh x14, x3, x11; \ - adcs x1, x1, x14; \ - umulh x14, x3, x12; \ - adcs x0, x0, x14; \ - umulh x14, x3, x13; \ - adc x15, x15, x14; \ - mul x14, x4, x5; \ - adds x19, x19, x14; \ - mul x14, x4, x6; \ - adcs x20, x20, x14; \ - mul x14, x4, x7; \ - adcs x21, x21, x14; \ - mul x14, x4, x8; \ - adcs x22, x22, x14; \ - mul x14, x4, x9; \ - adcs x23, x23, x14; \ - mul x14, x4, x10; \ - adcs x24, x24, x14; \ - mul x14, x4, x11; \ - adcs x1, x1, x14; \ - mul x14, x4, x12; \ - adcs x0, x0, x14; \ - mul x14, x4, x13; \ - adcs x15, x15, x14; \ - cset x16, hs; \ - umulh x14, x4, x5; \ - adds x20, x20, x14; \ - umulh x14, x4, x6; \ - adcs x21, x21, x14; \ - umulh x14, x4, x7; \ - adcs x22, x22, x14; \ - umulh x14, x4, x8; \ - adcs x23, x23, x14; \ - umulh x14, x4, x9; \ - adcs x24, x24, x14; \ - umulh x14, x4, x10; \ - adcs x1, x1, x14; \ - umulh x14, x4, x11; \ - adcs x0, x0, x14; \ - umulh x14, x4, x12; \ - adcs x15, x15, x14; \ - umulh x14, x4, x13; \ - adc x16, x16, x14; \ - stp x17, x19, [P0+16]; \ - ldp x3, x4, [P1+32]; \ - mul x14, x3, x5; \ - adds x20, x20, x14; \ - mul x14, x3, x6; \ - adcs x21, x21, x14; \ - mul x14, x3, x7; \ - adcs x22, x22, x14; \ - mul x14, x3, x8; \ - adcs x23, x23, x14; \ - mul x14, x3, x9; \ - adcs x24, x24, x14; \ - mul x14, x3, x10; \ - adcs x1, x1, x14; \ - mul x14, x3, x11; \ - adcs x0, x0, x14; \ - mul x14, x3, x12; \ - adcs x15, x15, x14; \ - mul x14, x3, x13; \ - adcs x16, x16, x14; \ - cset x17, hs; \ - umulh x14, x3, x5; \ - adds x21, x21, x14; \ - umulh x14, x3, x6; \ - adcs x22, x22, x14; \ - umulh x14, x3, x7; \ - adcs x23, x23, x14; \ - umulh x14, x3, x8; \ - adcs x24, x24, x14; \ - umulh x14, x3, x9; \ - adcs x1, x1, x14; \ - umulh x14, x3, x10; \ - adcs x0, x0, x14; \ - umulh x14, x3, x11; \ - adcs x15, x15, x14; \ - umulh x14, x3, x12; \ - adcs x16, x16, x14; \ - umulh x14, x3, x13; \ - adc x17, x17, x14; \ - mul x14, x4, x5; \ - adds x21, x21, x14; \ - mul x14, x4, x6; \ - adcs x22, x22, x14; \ - mul x14, x4, x7; \ - adcs x23, x23, x14; \ - mul x14, x4, x8; \ - adcs x24, x24, x14; \ - mul x14, x4, x9; \ - adcs x1, x1, x14; \ - mul x14, x4, x10; \ - adcs x0, x0, x14; \ - mul x14, x4, x11; \ - adcs x15, x15, x14; \ - mul x14, x4, x12; \ - adcs x16, x16, x14; \ - mul x14, x4, x13; \ - adcs x17, x17, x14; \ - cset x19, hs; \ - umulh x14, x4, x5; \ - adds x22, x22, x14; \ - umulh x14, x4, x6; \ - adcs x23, x23, x14; \ - umulh x14, x4, x7; \ - adcs x24, x24, x14; \ - umulh x14, x4, x8; \ - adcs x1, x1, x14; \ - umulh x14, x4, x9; \ - adcs x0, x0, x14; \ - umulh x14, x4, x10; \ - adcs x15, x15, x14; \ - umulh x14, x4, x11; \ - adcs x16, x16, x14; \ - umulh x14, x4, x12; \ - adcs x17, x17, x14; \ - umulh x14, x4, x13; \ - adc x19, x19, x14; \ - stp x20, x21, [P0+32]; \ - ldp x3, x4, [P1+48]; \ - mul x14, x3, x5; \ - adds x22, x22, x14; \ - mul x14, x3, x6; \ - adcs x23, x23, x14; \ - mul x14, x3, x7; \ - adcs x24, x24, x14; \ - mul x14, x3, x8; \ - adcs x1, x1, x14; \ - mul x14, x3, x9; \ - adcs x0, x0, x14; \ - mul x14, x3, x10; \ - adcs x15, x15, x14; \ - mul x14, x3, x11; \ - adcs x16, x16, x14; \ - mul x14, x3, x12; \ - adcs x17, x17, x14; \ - mul x14, x3, x13; \ - adcs x19, x19, x14; \ - cset x20, hs; \ - umulh x14, x3, x5; \ - adds x23, x23, x14; \ - umulh x14, x3, x6; \ - adcs x24, x24, x14; \ - 
umulh x14, x3, x7; \ - adcs x1, x1, x14; \ - umulh x14, x3, x8; \ - adcs x0, x0, x14; \ - umulh x14, x3, x9; \ - adcs x15, x15, x14; \ - umulh x14, x3, x10; \ - adcs x16, x16, x14; \ - umulh x14, x3, x11; \ - adcs x17, x17, x14; \ - umulh x14, x3, x12; \ - adcs x19, x19, x14; \ - umulh x14, x3, x13; \ - adc x20, x20, x14; \ - mul x14, x4, x5; \ - adds x23, x23, x14; \ - mul x14, x4, x6; \ - adcs x24, x24, x14; \ - mul x14, x4, x7; \ - adcs x1, x1, x14; \ - mul x14, x4, x8; \ - adcs x0, x0, x14; \ - mul x14, x4, x9; \ - adcs x15, x15, x14; \ - mul x14, x4, x10; \ - adcs x16, x16, x14; \ - mul x14, x4, x11; \ - adcs x17, x17, x14; \ - mul x14, x4, x12; \ - adcs x19, x19, x14; \ - mul x14, x4, x13; \ - adcs x20, x20, x14; \ - cset x21, hs; \ - umulh x14, x4, x5; \ - adds x24, x24, x14; \ - umulh x14, x4, x6; \ - adcs x1, x1, x14; \ - umulh x14, x4, x7; \ - adcs x0, x0, x14; \ - umulh x14, x4, x8; \ - adcs x15, x15, x14; \ - umulh x14, x4, x9; \ - adcs x16, x16, x14; \ - umulh x14, x4, x10; \ - adcs x17, x17, x14; \ - umulh x14, x4, x11; \ - adcs x19, x19, x14; \ - umulh x14, x4, x12; \ - adcs x20, x20, x14; \ - umulh x14, x4, x13; \ - adc x21, x21, x14; \ - stp x22, x23, [P0+48]; \ - ldr x3, [P1+64]; \ - mul x14, x3, x5; \ - adds x24, x24, x14; \ - mul x14, x3, x6; \ - adcs x1, x1, x14; \ - mul x14, x3, x7; \ - adcs x0, x0, x14; \ - mul x14, x3, x8; \ - adcs x15, x15, x14; \ - mul x14, x3, x9; \ - adcs x16, x16, x14; \ - mul x14, x3, x10; \ - adcs x17, x17, x14; \ - mul x14, x3, x11; \ - adcs x19, x19, x14; \ - mul x14, x3, x12; \ - adcs x20, x20, x14; \ - mul x14, x3, x13; \ - adc x21, x21, x14; \ - umulh x14, x3, x5; \ - adds x1, x1, x14; \ - umulh x14, x3, x6; \ - adcs x0, x0, x14; \ - umulh x14, x3, x7; \ - adcs x15, x15, x14; \ - umulh x14, x3, x8; \ - adcs x16, x16, x14; \ - umulh x14, x3, x9; \ - adcs x17, x17, x14; \ - umulh x14, x3, x10; \ - adcs x19, x19, x14; \ - umulh x14, x3, x11; \ - adcs x20, x20, x14; \ - umulh x14, x3, x12; \ - adc x21, x21, x14; \ - cmp xzr, xzr; \ - ldp x5, x6, [P0]; \ - extr x14, x1, x24, #9; \ - adcs x5, x5, x14; \ - extr x14, x0, x1, #9; \ - adcs x6, x6, x14; \ - ldp x7, x8, [P0+16]; \ - extr x14, x15, x0, #9; \ - adcs x7, x7, x14; \ - extr x14, x16, x15, #9; \ - adcs x8, x8, x14; \ - ldp x9, x10, [P0+32]; \ - extr x14, x17, x16, #9; \ - adcs x9, x9, x14; \ - extr x14, x19, x17, #9; \ - adcs x10, x10, x14; \ - ldp x11, x12, [P0+48]; \ - extr x14, x20, x19, #9; \ - adcs x11, x11, x14; \ - extr x14, x21, x20, #9; \ - adcs x12, x12, x14; \ - orr x13, x24, #0xfffffffffffffe00; \ - lsr x14, x21, #9; \ - adcs x13, x13, x14; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -// Corresponds exactly to bignum_sqr_p521_alt - -#define sqr_p521(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x11, x2, x3; \ - umulh x12, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x10, x2, x4; \ - umulh x13, x2, x4; \ - adds x12, x12, x10; \ - ldp x6, x7, [P1+32]; \ - mul x10, x2, x5; \ - umulh x14, x2, x5; \ - adcs x13, x13, x10; \ - ldp x8, x9, [P1+48]; \ - mul x10, x2, x6; \ - umulh x15, x2, x6; \ - adcs x14, x14, x10; \ - mul x10, x2, x7; \ - umulh x16, x2, x7; \ - adcs x15, x15, x10; \ - mul x10, x2, x8; \ - umulh x17, x2, x8; \ - adcs x16, x16, x10; \ - mul x10, x2, x9; \ - umulh x19, x2, x9; \ 
- adcs x17, x17, x10; \ - adc x19, x19, xzr; \ - mul x10, x3, x4; \ - adds x13, x13, x10; \ - mul x10, x3, x5; \ - adcs x14, x14, x10; \ - mul x10, x3, x6; \ - adcs x15, x15, x10; \ - mul x10, x3, x7; \ - adcs x16, x16, x10; \ - mul x10, x3, x8; \ - adcs x17, x17, x10; \ - mul x10, x3, x9; \ - adcs x19, x19, x10; \ - cset x20, hs; \ - umulh x10, x3, x4; \ - adds x14, x14, x10; \ - umulh x10, x3, x5; \ - adcs x15, x15, x10; \ - umulh x10, x3, x6; \ - adcs x16, x16, x10; \ - umulh x10, x3, x7; \ - adcs x17, x17, x10; \ - umulh x10, x3, x8; \ - adcs x19, x19, x10; \ - umulh x10, x3, x9; \ - adc x20, x20, x10; \ - mul x10, x6, x7; \ - umulh x21, x6, x7; \ - adds x20, x20, x10; \ - adc x21, x21, xzr; \ - mul x10, x4, x5; \ - adds x15, x15, x10; \ - mul x10, x4, x6; \ - adcs x16, x16, x10; \ - mul x10, x4, x7; \ - adcs x17, x17, x10; \ - mul x10, x4, x8; \ - adcs x19, x19, x10; \ - mul x10, x4, x9; \ - adcs x20, x20, x10; \ - mul x10, x6, x8; \ - adcs x21, x21, x10; \ - cset x22, hs; \ - umulh x10, x4, x5; \ - adds x16, x16, x10; \ - umulh x10, x4, x6; \ - adcs x17, x17, x10; \ - umulh x10, x4, x7; \ - adcs x19, x19, x10; \ - umulh x10, x4, x8; \ - adcs x20, x20, x10; \ - umulh x10, x4, x9; \ - adcs x21, x21, x10; \ - umulh x10, x6, x8; \ - adc x22, x22, x10; \ - mul x10, x7, x8; \ - umulh x23, x7, x8; \ - adds x22, x22, x10; \ - adc x23, x23, xzr; \ - mul x10, x5, x6; \ - adds x17, x17, x10; \ - mul x10, x5, x7; \ - adcs x19, x19, x10; \ - mul x10, x5, x8; \ - adcs x20, x20, x10; \ - mul x10, x5, x9; \ - adcs x21, x21, x10; \ - mul x10, x6, x9; \ - adcs x22, x22, x10; \ - mul x10, x7, x9; \ - adcs x23, x23, x10; \ - cset x24, hs; \ - umulh x10, x5, x6; \ - adds x19, x19, x10; \ - umulh x10, x5, x7; \ - adcs x20, x20, x10; \ - umulh x10, x5, x8; \ - adcs x21, x21, x10; \ - umulh x10, x5, x9; \ - adcs x22, x22, x10; \ - umulh x10, x6, x9; \ - adcs x23, x23, x10; \ - umulh x10, x7, x9; \ - adc x24, x24, x10; \ - mul x10, x8, x9; \ - umulh x25, x8, x9; \ - adds x24, x24, x10; \ - adc x25, x25, xzr; \ - adds x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - adcs x15, x15, x15; \ - adcs x16, x16, x16; \ - adcs x17, x17, x17; \ - adcs x19, x19, x19; \ - adcs x20, x20, x20; \ - adcs x21, x21, x21; \ - adcs x22, x22, x22; \ - adcs x23, x23, x23; \ - adcs x24, x24, x24; \ - adcs x25, x25, x25; \ - cset x0, hs; \ - umulh x10, x2, x2; \ - adds x11, x11, x10; \ - mul x10, x3, x3; \ - adcs x12, x12, x10; \ - umulh x10, x3, x3; \ - adcs x13, x13, x10; \ - mul x10, x4, x4; \ - adcs x14, x14, x10; \ - umulh x10, x4, x4; \ - adcs x15, x15, x10; \ - mul x10, x5, x5; \ - adcs x16, x16, x10; \ - umulh x10, x5, x5; \ - adcs x17, x17, x10; \ - mul x10, x6, x6; \ - adcs x19, x19, x10; \ - umulh x10, x6, x6; \ - adcs x20, x20, x10; \ - mul x10, x7, x7; \ - adcs x21, x21, x10; \ - umulh x10, x7, x7; \ - adcs x22, x22, x10; \ - mul x10, x8, x8; \ - adcs x23, x23, x10; \ - umulh x10, x8, x8; \ - adcs x24, x24, x10; \ - mul x10, x9, x9; \ - adcs x25, x25, x10; \ - umulh x10, x9, x9; \ - adc x0, x0, x10; \ - ldr x1, [P1+64]; \ - add x1, x1, x1; \ - mul x10, x1, x2; \ - adds x19, x19, x10; \ - umulh x10, x1, x2; \ - adcs x20, x20, x10; \ - mul x10, x1, x4; \ - adcs x21, x21, x10; \ - umulh x10, x1, x4; \ - adcs x22, x22, x10; \ - mul x10, x1, x6; \ - adcs x23, x23, x10; \ - umulh x10, x1, x6; \ - adcs x24, x24, x10; \ - mul x10, x1, x8; \ - adcs x25, x25, x10; \ - umulh x10, x1, x8; \ - adcs x0, x0, x10; \ - lsr x4, x1, #1; \ - mul x4, x4, x4; \ - adc x4, x4, xzr; \ - mul x10, x1, x3; 
\ - adds x20, x20, x10; \ - umulh x10, x1, x3; \ - adcs x21, x21, x10; \ - mul x10, x1, x5; \ - adcs x22, x22, x10; \ - umulh x10, x1, x5; \ - adcs x23, x23, x10; \ - mul x10, x1, x7; \ - adcs x24, x24, x10; \ - umulh x10, x1, x7; \ - adcs x25, x25, x10; \ - mul x10, x1, x9; \ - adcs x0, x0, x10; \ - umulh x10, x1, x9; \ - adc x4, x4, x10; \ - mul x2, x2, x2; \ - cmp xzr, xzr; \ - extr x10, x20, x19, #9; \ - adcs x2, x2, x10; \ - extr x10, x21, x20, #9; \ - adcs x11, x11, x10; \ - extr x10, x22, x21, #9; \ - adcs x12, x12, x10; \ - extr x10, x23, x22, #9; \ - adcs x13, x13, x10; \ - extr x10, x24, x23, #9; \ - adcs x14, x14, x10; \ - extr x10, x25, x24, #9; \ - adcs x15, x15, x10; \ - extr x10, x0, x25, #9; \ - adcs x16, x16, x10; \ - extr x10, x4, x0, #9; \ - adcs x17, x17, x10; \ - orr x19, x19, #0xfffffffffffffe00; \ - lsr x10, x4, #9; \ - adcs x19, x19, x10; \ - sbcs x2, x2, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbcs x15, x15, xzr; \ - sbcs x16, x16, xzr; \ - sbcs x17, x17, xzr; \ - sbc x19, x19, xzr; \ - and x19, x19, #0x1ff; \ - stp x2, x11, [P0]; \ - stp x12, x13, [P0+16]; \ - stp x14, x15, [P0+32]; \ - stp x16, x17, [P0+48]; \ - str x19, [P0+64] - -// Corresponds exactly to bignum_sub_p521 - -#define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ - str x13, [P0+64] - -S2N_BN_SYMBOL(p521_jmixadd_alt): - -// Save regs and make room on stack for temporary variables - - stp x19, x20, [sp, #-16]! - stp x21, x22, [sp, #-16]! - stp x23, x24, [sp, #-16]! - stp x25, x26, [sp, #-16]! - stp x27, x28, [sp, #-16]! - sub sp, sp, NSPACE - -// Move the input arguments to stable places - - mov input_z, x0 - mov input_x, x1 - mov input_y, x2 - -// Main code, just a sequence of basic field operations - - sqr_p521(zp2,z_1) - mul_p521(y2a,z_1,y_2) - - mul_p521(x2a,zp2,x_2) - mul_p521(y2a,zp2,y2a) - - sub_p521(xd,x2a,x_1) - sub_p521(yd,y2a,y_1) - - sqr_p521(zz,xd) - sqr_p521(ww,yd) - - mul_p521(zzx1,zz,x_1) - mul_p521(zzx2,zz,x2a) - - sub_p521(resx,ww,zzx1) - sub_p521(t1,zzx2,zzx1) - - mul_p521(resz,xd,z_1) - - sub_p521(resx,resx,zzx2) - - sub_p521(t2,zzx1,resx) - - mul_p521(t1,t1,y_1) - mul_p521(t2,yd,t2) - - sub_p521(resy,t2,t1) - -// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) - - ldp x0, x1, [z_1] - orr x0, x0, x1 - ldp x2, x3, [z_1+16] - orr x2, x2, x3 - ldp x4, x5, [z_1+32] - orr x4, x4, x5 - ldp x6, x7, [z_1+48] - orr x6, x6, x7 - ldr x8, [z_1+64] - orr x0, x0, x2 - orr x4, x4, x6 - orr x0, x0, x4 - orr x0, x0, x8 - cmp x0, xzr - -// Multiplex: if p1 <> 0 just copy the computed result from the staging area. -// If p1 = 0 then return the point p2 augmented with an extra z = 1 -// coordinate, hence giving 0 + p2 = p2 for the final result. 
- - ldp x0, x1, [resx] - ldp x20, x21, [x_2] - csel x0, x0, x20, ne - csel x1, x1, x21, ne - ldp x2, x3, [resx+16] - ldp x20, x21, [x_2+16] - csel x2, x2, x20, ne - csel x3, x3, x21, ne - ldp x4, x5, [resx+32] - ldp x20, x21, [x_2+32] - csel x4, x4, x20, ne - csel x5, x5, x21, ne - ldp x6, x7, [resx+48] - ldp x20, x21, [x_2+48] - csel x6, x6, x20, ne - csel x7, x7, x21, ne - ldr x8, [resx+64] - ldr x20, [x_2+64] - csel x8, x8, x20, ne - - ldp x10, x11, [resy] - ldp x20, x21, [y_2] - csel x10, x10, x20, ne - csel x11, x11, x21, ne - ldp x12, x13, [resy+16] - ldp x20, x21, [y_2+16] - csel x12, x12, x20, ne - csel x13, x13, x21, ne - ldp x14, x15, [resy+32] - ldp x20, x21, [y_2+32] - csel x14, x14, x20, ne - csel x15, x15, x21, ne - ldp x16, x17, [resy+48] - ldp x20, x21, [y_2+48] - csel x16, x16, x20, ne - csel x17, x17, x21, ne - ldr x19, [resy+64] - ldr x20, [y_2+64] - csel x19, x19, x20, ne - - stp x0, x1, [x_3] - stp x2, x3, [x_3+16] - stp x4, x5, [x_3+32] - stp x6, x7, [x_3+48] - str x8, [x_3+64] - stp x10, x11, [y_3] - stp x12, x13, [y_3+16] - stp x14, x15, [y_3+32] - stp x16, x17, [y_3+48] - str x19, [y_3+64] - - ldp x0, x1, [resz] - mov x20, #1 - csel x0, x0, x20, ne - csel x1, x1, xzr, ne - ldp x2, x3, [resz+16] - csel x2, x2, xzr, ne - csel x3, x3, xzr, ne - ldp x4, x5, [resz+32] - csel x4, x4, xzr, ne - csel x5, x5, xzr, ne - ldp x6, x7, [resz+48] - csel x6, x6, xzr, ne - csel x7, x7, xzr, ne - ldr x8, [resz+64] - csel x8, x8, xzr, ne - - stp x0, x1, [z_3] - stp x2, x3, [z_3+16] - stp x4, x5, [z_3+32] - stp x6, x7, [z_3+48] - str x8, [z_3+64] - -// Restore stack and registers - - add sp, sp, NSPACE - - ldp x27, x28, [sp], 16 - ldp x25, x26, [sp], 16 - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - - ret - -#if defined(__linux__) && defined(__ELF__) -.section .note.GNU-stack, "", %progbits -#endif diff --git a/third_party/s2n-bignum/import.sh b/third_party/s2n-bignum/import.sh new file mode 100755 index 00000000000..4a2bb1b638d --- /dev/null +++ b/third_party/s2n-bignum/import.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC + +# https://github.com/awslabs/s2n-bignum -> AWS-LC importer script +# +# This script imports a version of s2n-bignum source into AWS-LC. +# +# Usage: +# +# ``` +# rm -rf ./s2n-bignum-imported +# ./import.sh +# ``` +# +# This imports s2n-bignum from https://github.com/awslabs/s2n-bignum +# and leaves import meta data in META.yml. +# +# If you want to import a specific branch/tag or from a specific repository +# either set GITHUB_TARGET or GITHUB_REPOSITORY as below: +# +# ``` +# GITHUB_REPOSITORY=/ GITHUB_TARGET= ./import.sh +# ``` + +GITHUB_SERVER_URL="https://github.com/" +GITHUB_REPOSITORY=${GITHUB_REPOSITORY:=awslabs/s2n-bignum.git} +GITHUB_TARGET=${GITHUB_TARGET:=main} + +SRC="s2n-bignum-imported" +TMP="TEMP_CAN_DELETE" + +# Check if TMP directory already exists +if [ -d "${TMP}" ]; then + echo "Source directory or symlink ${TMP} does already exist -- please remove it before re-running the importer" + exit 1 +fi + +# Check if source directory already exists +if [ -d "${SRC}" ]; then + echo "Source directory or symlink ${SRC} does already exist -- please remove it before re-running the importer" + exit 1 +fi + +mkdir ${TMP} + +echo "Fetching repository ..." 
+git clone ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY} ${TMP} --branch ${GITHUB_TARGET} --single-branch > /dev/null +GITHUB_COMMIT=$(cd ${TMP} >/dev/null; git rev-parse HEAD) + +echo "Cloned s2n-bignum folder" +ls -la ${TMP} + +echo "Remove source code from s2n-bignum that is not needed..." +code_not_needed=("benchmarks" "codebuild" "common" "tests" "tools" "x86" "arm/proofs") +for code in "${code_not_needed[@]}"; do + rm -rf ${TMP}/${code} +done + +echo "Cloned s2n-bignum folder after removing unneeded source code..." +ls -la ${TMP} + +echo "Copy source code ..." +mkdir ${SRC} +cp -rH ${TMP}/* ${SRC} + +echo "Copied s2n-bignum source code..." +ls -la ${SRC} + +echo "Remove temporary artifacts ..." +rm -rf ${TMP} + +echo "Generating META.yml file ..." +cat <<EOF > META.yml +name: ${SRC} +source: ${GITHUB_REPOSITORY} +commit: ${GITHUB_COMMIT} +target: ${GITHUB_TARGET} +imported-at: $(env TZ=UTC date "+%Y-%m-%dT%H:%M:%S%z") +EOF + +# Submodule path might be cached. +echo "" +echo "Post actions: Run" +echo "$ git add ${SRC} META.yml ; git commit -m \"Imported s2n-bignum version: ${GITHUB_TARGET}/${GITHUB_COMMIT}\"" +echo "to add new source to git tree" diff --git a/third_party/s2n-bignum/include/_internal_s2n_bignum.h b/third_party/s2n-bignum/include/_internal_s2n_bignum.h deleted file mode 100644 index c7cedb633a4..00000000000 --- a/third_party/s2n-bignum/include/_internal_s2n_bignum.h +++ /dev/null @@ -1,17 +0,0 @@ - -#ifdef __APPLE__ -# define S2N_BN_SYMBOL(NAME) _##NAME -#else -# define S2N_BN_SYMBOL(name) name -#endif - -#define S2N_BN_SYM_VISIBILITY_DIRECTIVE(name) .globl S2N_BN_SYMBOL(name) -#ifdef S2N_BN_HIDE_SYMBOLS -# ifdef __APPLE__ -# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) .private_extern S2N_BN_SYMBOL(name) -# else -# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) .hidden S2N_BN_SYMBOL(name) -# endif -#else -# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) /* NO-OP: S2N_BN_SYM_PRIVACY_DIRECTIVE */ -#endif \ No newline at end of file diff --git a/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h b/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h deleted file mode 100644 index 186029bf08f..00000000000 --- a/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h +++ /dev/null @@ -1,433 +0,0 @@ -/* - * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"). - * You may not use this file except in compliance with the License. - * A copy of the License is located at - * - * http://aws.amazon.com/apache2.0 - * - * or in the "LICENSE" file accompanying this file. This file is distributed - * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either - * express or implied. See the License for the specific language governing - * permissions and limitations under the License. - */ -#ifndef S2N_BIGNUM_AWS_LC_H -#define S2N_BIGNUM_AWS_LC_H - -#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__) - #define S2N_BIGNUM_STATIC -#else - #define S2N_BIGNUM_STATIC static -#endif - -// ---------------------------------------------------------------------------- -// C prototypes for s2n-bignum functions used in AWS-LC -// ---------------------------------------------------------------------------- - -// For some functions there are additional variants with names ending in -// "_alt".
These have the same core mathematical functionality as their -// non-"alt" versions, but can be better suited to some microarchitectures: -// -// - On x86, the "_alt" forms avoid BMI and ADX instruction set -// extensions, so will run on any x86_64 machine, even older ones -// -// - On ARM, the "_alt" forms target machines with higher multiplier -// throughput, generally offering higher performance there. -// For each of those, we define a _selector function that selects, in runtime, -// the _alt or non-_alt version to run. - -#if defined(OPENSSL_X86_64) -// On x86_64 platforms s2n-bignum uses bmi2 and adx instruction sets -// for some of the functions. These instructions are not supported by -// every x86 CPU so we have to check if they are available and in case -// they are not we fallback to slightly slower but generic implementation. -static inline uint8_t use_s2n_bignum_alt(void) { - return (!CRYPTO_is_BMI2_capable() || !CRYPTO_is_ADX_capable()); -} -#else -// On aarch64 platforms s2n-bignum has two implementations of certain -// functions -- the default one and the alternative (suffixed _alt). -// Depending on the architecture one version is faster than the other. -// Generally, the "_alt" functions are faster on architectures with higher -// multiplier throughput, for example, Graviton 3, Apple's M1 and iPhone chips. -static inline uint8_t use_s2n_bignum_alt(void) { - return CRYPTO_is_ARMv8_wide_multiplier_capable(); -} -#endif - -extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]); -extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]); -static inline void p256_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]) { - if (use_s2n_bignum_alt()) { p256_montjscalarmul_alt(res, scalar, point); } - else { p256_montjscalarmul(res, scalar, point); } -} - -// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 -// z = x^-1 mod p_256. -// The function is constant-time. 
-extern void bignum_montinv_p256(uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); - -// Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced -// Inputs x[6], y[6]; output z[6] -extern void bignum_add_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); - -// Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 -// Input x[6]; output z[6] -extern void bignum_deamont_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -extern void bignum_deamont_p384_alt(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -static inline void bignum_deamont_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) { - if (use_s2n_bignum_alt()) { bignum_deamont_p384_alt(z, x); } - else { bignum_deamont_p384(z, x); } -} - -// Montgomery multiply, z := (x * y / 2^384) mod p_384 -// Inputs x[6], y[6]; output z[6] -extern void bignum_montmul_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); -extern void bignum_montmul_p384_alt(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); -static inline void bignum_montmul_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]) { - if (use_s2n_bignum_alt()) { bignum_montmul_p384_alt(z, x, y); } - else { bignum_montmul_p384(z, x, y); } -} - -// Montgomery square, z := (x^2 / 2^384) mod p_384 -// Input x[6]; output z[6] -extern void bignum_montsqr_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -extern void bignum_montsqr_p384_alt(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -static inline void bignum_montsqr_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) { - if (use_s2n_bignum_alt()) { bignum_montsqr_p384_alt(z, x); } - else { bignum_montsqr_p384(z, x); } -} - -// Negate modulo p_384, z := (-x) mod p_384, assuming x reduced -// Input x[6]; output z[6] -extern void bignum_neg_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); - -// Subtract modulo p_384, z := (x - y) mod p_384 -// Inputs x[6], y[6]; output z[6] -extern void bignum_sub_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); - -// Convert to Montgomery form z := (2^384 * x) mod p_384 */ -// Input x[6]; output z[6] */ -extern void bignum_tomont_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -extern void bignum_tomont_p384_alt(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); -static inline void bignum_tomont_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) { - if (use_s2n_bignum_alt()) { bignum_tomont_p384_alt(z, x); } - else { bignum_tomont_p384(z, x); } -} -extern void p384_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 18],uint64_t p1[S2N_BIGNUM_STATIC 18]); -extern void p384_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],uint64_t p1[S2N_BIGNUM_STATIC 18]); -static inline void p384_montjdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 18],uint64_t p1[S2N_BIGNUM_STATIC 18]) { - if (use_s2n_bignum_alt()) { p384_montjdouble_alt(p3, p1); } - else { p384_montjdouble(p3, p1); } -} - -extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t 
scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]); -extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]); -static inline void p384_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]) { - if (use_s2n_bignum_alt()) { p384_montjscalarmul_alt(res, scalar, point); } - else { p384_montjscalarmul(res, scalar, point); } -} - -// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 -// z = x^-1 mod p_384. -// The function is constant-time. -extern void bignum_montinv_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); - -// Convert 6-digit (384-bit) bignum from little-endian form -// Input x[6]; output z[6] -extern void bignum_fromlebytes_6(uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]); - -// Convert 6-digit (384-bit) bignum to little-endian form -// Input x[6]; output z[6] -extern void bignum_tolebytes_6(uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]); - -// 384-bit nonzeroness test, returning 1 if x is nonzero, 0 if x is zero -// Input x[6]; output function return -extern uint64_t bignum_nonzero_6(const uint64_t x[S2N_BIGNUM_STATIC 6]); - -// Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced -// Inputs x[9], y[9]; output z[9] -extern void bignum_add_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); - -// Subtract modulo p_521, z := (x - y) mod p_521 -// Inputs x[9], y[9]; output z[9] -extern void bignum_sub_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); - -// Negate modulo p_521, z := (-x) mod p_521, assuming x reduced -// Input x[9]; output z[9] -extern void bignum_neg_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); - -// Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced -// Inputs x[9], y[9]; output z[9] -extern void bignum_mul_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); -extern void bignum_mul_p521_alt(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); -static inline void bignum_mul_p521_selector(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]) { - if (use_s2n_bignum_alt()) { bignum_mul_p521_alt(z, x, y); } - else { bignum_mul_p521(z, x, y); } -} - -// Square modulo p_521, z := (x^2) mod p_521, assuming x reduced -// Input x[9]; output z[9] -extern void bignum_sqr_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); -extern void bignum_sqr_p521_alt(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); -static inline void bignum_sqr_p521_selector(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]) { - if (use_s2n_bignum_alt()) { bignum_sqr_p521_alt(z, x); } - else { bignum_sqr_p521(z, x); } -} - -// Convert little-endian bytes to 9-digit 528-bit bignum -extern void bignum_fromlebytes_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint8_t x[S2N_BIGNUM_STATIC 66]); - -// Convert 9-digit 528-bit bignum to little-endian bytes -extern void bignum_tolebytes_p521(uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]); - -extern void 
p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]); -extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]); -static inline void p521_jdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]) { - if (use_s2n_bignum_alt()) { p521_jdouble_alt(p3, p1); } - else { p521_jdouble(p3, p1); } -} -extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]); -extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]); -static inline void p521_jscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]) { - if (use_s2n_bignum_alt()) { p521_jscalarmul_alt(res, scalar, point); } - else { p521_jscalarmul(res, scalar, point); } -} - -// Modular inverse modulo p_521 = 2^521 - 1 -// z = x^-1 mod p_521. -// The function is constant-time. -extern void bignum_inv_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); - -// curve25519_x25519_byte and curve25519_x25519_byte_alt computes the x25519 -// function specified in https://www.rfc-editor.org/rfc/rfc7748. |scalar| is the -// scalar, |point| is the u-coordinate of the elliptic curve -// point. The result, another u-coordinate, is saved in |res|. -extern void curve25519_x25519_byte(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32], const uint8_t point[S2N_BIGNUM_STATIC 32]); -extern void curve25519_x25519_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32], const uint8_t point[S2N_BIGNUM_STATIC 32]); -static inline void curve25519_x25519_byte_selector(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32], const uint8_t point[S2N_BIGNUM_STATIC 32]) { - if (use_s2n_bignum_alt()) { curve25519_x25519_byte_alt(res, scalar, point); } - else { curve25519_x25519_byte(res, scalar, point); } -} - -// curve25519_x25519base_byte and curve25519_x25519base_byte_alt computes the -// x25519 function specified in https://www.rfc-editor.org/rfc/rfc7748 using the -// basepoint specified in section 4.1. |scalar| is the scalar. The result, -// another u-coordinate, is saved in |res|. -extern void curve25519_x25519base_byte(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32]); -extern void curve25519_x25519base_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32]); -static inline void curve25519_x25519base_byte_selector(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32]) { - if (use_s2n_bignum_alt()) { curve25519_x25519base_byte_alt(res, scalar); } - else { curve25519_x25519base_byte(res, scalar); } -} - -// Evaluate z := x^2 where x is a 2048-bit integer. -// Input: x[32]; output: z[64]; temporary buffer: t[>=72] -#define S2NBIGNUM_KSQR_32_64_TEMP_NWORDS 72 -extern void -bignum_ksqr_32_64(uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KSQR_32_64_TEMP_NWORDS]); -extern void -bignum_ksqr_32_64_neon(uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KSQR_32_64_TEMP_NWORDS]); - -// Evaluate z := x^2 where x is a 1024-bit integer. 
-// Input: x[16]; output: z[32]; temporary buffer: t[>=24] -#define S2NBIGNUM_KSQR_16_32_TEMP_NWORDS 24 -extern void -bignum_ksqr_16_32(uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KSQR_16_32_TEMP_NWORDS]); -extern void -bignum_ksqr_16_32_neon(uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KSQR_16_32_TEMP_NWORDS]); - -// Evaluate z := x * y where x and y are 2048-bit integers. -// Inputs: x[32], y[32]; output: z[64]; temporary buffer t[>=96] -#define S2NBIGNUM_KMUL_32_64_TEMP_NWORDS 96 -extern void -bignum_kmul_32_64(uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], - const uint64_t y[S2N_BIGNUM_STATIC 32], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KMUL_32_64_TEMP_NWORDS]); -extern void -bignum_kmul_32_64_neon(uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], - const uint64_t y[S2N_BIGNUM_STATIC 32], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KMUL_32_64_TEMP_NWORDS]); - -// Evaluate z := x * y where x and y are 1024-bit integers. -// Inputs: x[16], y[16]; output: z[32]; temporary buffer t[>=32] -#define S2NBIGNUM_KMUL_16_32_TEMP_NWORDS 32 -extern void -bignum_kmul_16_32(uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], - const uint64_t y[S2N_BIGNUM_STATIC 16], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KMUL_16_32_TEMP_NWORDS]); -extern void -bignum_kmul_16_32_neon(uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], - const uint64_t y[S2N_BIGNUM_STATIC 16], - uint64_t t[S2N_BIGNUM_STATIC S2NBIGNUM_KMUL_16_32_TEMP_NWORDS]); - -// Extended Montgomery reduce in 8-digit blocks. -// Assumes that z initially holds a 2k-digit bignum z_0, m is a k-digit odd -// bignum and m * w == -1 (mod 2^64). This function also uses z for the output -// as well as returning a carry c of 0 or 1. This encodes two numbers: in the -// lower half of the z buffer we have q = z[0..k-1], while the upper half -// together with the carry gives r = 2^{64k}*c + z[k..2k-1]. These values -// satisfy z_0 + q * m = 2^{64k} * r, i.e. r gives a raw (unreduced) Montgomery -// reduction while q gives the multiplier that was used. -// Note that q = (z_0 mod 2^{64k}) * (-m^-1 mod 2^{64k}) mod 2^{64k}. -// z_0 + q * m = 0 mod 2^{64k} -// q * m = -z_0 mod 2^{64k} -// q = -z_0 * m^-1 mod 2^{64k} -// = (z_0 mod 2^{64k}) * (-m^-1 mod 2^{64k}) mod 2^{64k} -// q is uniquely determined because q must be in the range of [0, 2^{64k}-1]. -// Inputs: z[2*k], m[k], w; outputs: function return (extra result bit) and z[2*k] -extern uint64_t bignum_emontredc_8n(uint64_t k, uint64_t *z, const uint64_t *m, - uint64_t w); -extern uint64_t bignum_emontredc_8n_neon(uint64_t k, uint64_t *z, const uint64_t *m, - uint64_t w); - -// Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) -// Inputs: x[k], p, y[k]; outputs: function return (carry-out) and z[k] -extern uint64_t bignum_optsub(uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, - const uint64_t *y); - -// Compare bignums, x >= y. -// Inputs: x[m], y[n]; output: function return (1 if x >= y) -extern uint64_t bignum_ge(uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); - -// General big-integer multiplication (z := x * y). -// Inputs: x[m], y[n]; output: z[k]. If k < m+n, the result is truncated. 
-extern void bignum_mul(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, - uint64_t n, const uint64_t *y); - -// General big-integer squaring (z := x^2). -// Inputs: x[m]; output: z[k]. If k < 2m, the result is truncated. -extern void bignum_sqr(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x); - -// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] -// into z[0..row-1]. -// This function is constant-time with respect to the value of `idx`. This is -// achieved by reading the whole table and using the bit-masking to get the -// `idx`-th row. -// Input table[height*width]; output z[width] -extern void bignum_copy_row_from_table (uint64_t *z, const uint64_t *table, - uint64_t height, uint64_t width, uint64_t idx); - -// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] -// into z[0..row-1]. width must be a multiple of 8. -// This function is constant-time with respect to the value of `idx`. This is -// achieved by reading the whole table and using the bit-masking to get the -// `idx`-th row. -// Input table[height*width]; output z[width] -extern void bignum_copy_row_from_table_8n_neon (uint64_t *z, const uint64_t *table, - uint64_t height, uint64_t width, uint64_t idx); - -// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] into z[0..row-1]. -// This function is constant-time with respect to the value of `idx`. This is -// achieved by reading the whole table and using the bit-masking to get the -// `idx`-th row. -// Input table[height*16]; output z[16] -extern void bignum_copy_row_from_table_16_neon (uint64_t *z, const uint64_t *table, - uint64_t height, uint64_t idx); - -// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] into z[0..row-1]. -// This function is constant-time with respect to the value of `idx`. This is -// achieved by reading the whole table and using the bit-masking to get the -// `idx`-th row. -// Input table[height*32]; output z[32] -extern void bignum_copy_row_from_table_32_neon (uint64_t *z, const uint64_t *table, - uint64_t height, uint64_t idx); - -// Reduction is modulo the order of the curve25519/edwards25519 basepoint, -// which is n_25519 = 2^252 + 27742317777372353535851937790883648493. -// Reduce modulo basepoint order, z := x mod n_25519 -// Input x[k]; output z[4] -extern void bignum_mod_n25519(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, uint64_t *x); - -// Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced -// Input x[4]; output z[4] -extern void bignum_neg_p25519(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S2N_BIGNUM_STATIC 4]); - -// Performs z := (x * y + c) mod n_25519, where the modulus is -// n_25519 = 2^252 + 27742317777372353535851937790883648493, the -// order of the curve25519/edwards25519 basepoint. The result z -// and the inputs x, y and c are all 4 digits (256 bits). 
-// Inputs x[4], y[4], c[4]; output z[4] -extern void bignum_madd_n25519(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S2N_BIGNUM_STATIC 4], - uint64_t y[S2N_BIGNUM_STATIC 4], uint64_t c[S2N_BIGNUM_STATIC 4]); -extern void bignum_madd_n25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S2N_BIGNUM_STATIC 4], - uint64_t y[S2N_BIGNUM_STATIC 4], uint64_t c[S2N_BIGNUM_STATIC 4]); -static inline void bignum_madd_n25519_selector(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S2N_BIGNUM_STATIC 4], uint64_t y[S2N_BIGNUM_STATIC 4], uint64_t c[S2N_BIGNUM_STATIC 4]) { - if (use_s2n_bignum_alt()) { bignum_madd_n25519_alt(z, x, y, c); } - else { bignum_madd_n25519(z, x, y, c); } -} - -// This assumes that the input buffer p points to a pair of 256-bit -// numbers x (at p) and y (at p+4) representing a point (x,y) on the -// edwards25519 curve. It is assumed that both x and y are < p_25519 -// but there is no checking of this, nor of the fact that (x,y) is -// in fact on the curve. -// -// The output in z is a little-endian array of bytes corresponding to -// the standard compressed encoding of a point as 2^255 * x_0 + y -// where x_0 is the least significant bit of x. -// See "https://datatracker.ietf.org/doc/html/rfc8032#section-5.1.2" -// In this implementation, y is simply truncated to 255 bits, but if -// it is reduced mod p_25519 as expected this does not affect values. -extern void edwards25519_encode(uint8_t z[S2N_BIGNUM_STATIC 32], uint64_t p[S2N_BIGNUM_STATIC 8]); - -// This interprets the input byte string as a little-endian number -// representing a point (x,y) on the edwards25519 curve, encoded as -// 2^255 * x_0 + y where x_0 is the least significant bit of x. It -// returns the full pair of coordinates x (at z) and y (at z+4). The -// return code is 0 for success and 1 for failure, which means that -// the input does not correspond to the encoding of any edwards25519 -// point. This can happen for three reasons, where y = the lowest -// 255 bits of the input: -// -// * y >= p_25519 -// Input y coordinate is not reduced -// * (y^2 - 1) * (1 + d_25519 * y^2) has no modular square root -// There is no x such that (x,y) is on the curve -// * y^2 = 1 and top bit of input is set -// Cannot be the canonical encoding of (0,1) or (0,-1) -// -// Input c[32] (bytes); output function return and z[8] -extern uint64_t edwards25519_decode(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]); -extern uint64_t edwards25519_decode_alt(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]); -static inline uint64_t edwards25519_decode_selector(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]) { - if (use_s2n_bignum_alt()) { return edwards25519_decode_alt(z, c); } - else { return edwards25519_decode(z, c); } -} - -// Given a scalar n, returns point (X,Y) = n * B where B = (...,4/5) is -// the standard basepoint for the edwards25519 (Ed25519) curve. 
-// Input scalar[4]; output res[8] -extern void edwards25519_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4]); -extern void edwards25519_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4]); -static inline void edwards25519_scalarmulbase_selector(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4]) { - if (use_s2n_bignum_alt()) { edwards25519_scalarmulbase_alt(res, scalar); } - else { edwards25519_scalarmulbase(res, scalar); } -} - -// Given scalar = n, point = P and bscalar = m, returns in res -// the point (X,Y) = n * P + m * B where B = (...,4/5) is -// the standard basepoint for the edwards25519 (Ed25519) curve. -// -// Both 256-bit coordinates of the input point P are implicitly -// reduced modulo 2^255-19 if they are not already in reduced form, -// but the conventional usage is that they *are* already reduced. -// The scalars can be arbitrary 256-bit numbers but may also be -// considered as implicitly reduced modulo the group order. -// -// Input scalar[4], point[8], bscalar[4]; output res[8] -extern void edwards25519_scalarmuldouble(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4], - uint64_t point[S2N_BIGNUM_STATIC 8], uint64_t bscalar[S2N_BIGNUM_STATIC 4]); -extern void edwards25519_scalarmuldouble_alt(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4], - uint64_t point[S2N_BIGNUM_STATIC 8], uint64_t bscalar[S2N_BIGNUM_STATIC 4]); -static inline void edwards25519_scalarmuldouble_selector(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 8], uint64_t bscalar[S2N_BIGNUM_STATIC 4]) { - if (use_s2n_bignum_alt()) { edwards25519_scalarmuldouble_alt(res, scalar, point, bscalar); } - else { edwards25519_scalarmuldouble(res, scalar, point, bscalar); } -} - -#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/CODE_OF_CONDUCT.md b/third_party/s2n-bignum/s2n-bignum-imported/CODE_OF_CONDUCT.md new file mode 100644 index 00000000000..5b627cfa60b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/CODE_OF_CONDUCT.md @@ -0,0 +1,4 @@ +## Code of Conduct +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +opensource-codeofconduct@amazon.com with any additional questions or comments. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/CONTRIBUTING.md b/third_party/s2n-bignum/s2n-bignum-imported/CONTRIBUTING.md new file mode 100644 index 00000000000..c4b6a1c5081 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/CONTRIBUTING.md @@ -0,0 +1,59 @@ +# Contributing Guidelines + +Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional +documentation, we greatly value feedback and contributions from our community. + +Please read through this document before submitting any issues or pull requests to ensure we have all the necessary +information to effectively respond to your bug report or contribution. + + +## Reporting Bugs/Feature Requests + +We welcome you to use the GitHub issue tracker to report bugs or suggest features. + +When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already +reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: + +* A reproducible test case or series of steps +* The version of our code being used +* Any modifications you've made relevant to the bug +* Anything unusual about your environment or deployment + + +## Contributing via Pull Requests +Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: + +1. You are working against the latest source on the *main* branch. +2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. +3. You open an issue to discuss any significant work - we would hate for your time to be wasted. + +To send us a pull request, please: + +1. Fork the repository. +2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. +3. Ensure local tests pass. +4. Commit to your fork using clear commit messages. +5. Send us a pull request, answering any default questions in the pull request interface. +6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. + +GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and +[creating a pull request](https://help.github.com/articles/creating-a-pull-request/). + + +## Finding contributions to work on +Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. + + +## Code of Conduct +This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact +opensource-codeofconduct@amazon.com with any additional questions or comments. + + +## Security issue notifications +If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. + + +## Licensing + +See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/LICENSE b/third_party/s2n-bignum/s2n-bignum-imported/LICENSE new file mode 100644 index 00000000000..7a5168f979d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/LICENSE @@ -0,0 +1,222 @@ +SPDX-License-Identifier: Apache-2.0 OR ISC or MIT-0 + + +Apache 2.0 license +------------------------------------- + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + +ISC license +------------------------------------- + +Copyright Amazon.com, Inc. or its affiliates. + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + +MIT-0 license +------------------------------------- + +Copyright 2021-2024 Amazon.com, Inc. or its affiliates. 
+ +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/NOTICE b/third_party/s2n-bignum/s2n-bignum-imported/NOTICE new file mode 100644 index 00000000000..616fc588945 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/NOTICE @@ -0,0 +1 @@ +Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/README.md b/third_party/s2n-bignum/s2n-bignum-imported/README.md new file mode 100644 index 00000000000..769bfd8497e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/README.md @@ -0,0 +1,512 @@ +## s2n-bignum + +s2n-bignum is a collection of integer arithmetic routines designed for +cryptographic applications. All routines are written in pure machine code, +designed to be callable from C and other high-level languages, with separate but +API-compatible versions of each function for 64-bit x86 (x86_64) and ARM +(aarch64). + +s2n-bignum's primary goals are performance and assurance: Assembly routines are +tuned for highest performance both by hand and using automatic optimization +techniques such as the [SLOTHY](https://github.com/slothy-optimizer/slothy) +superoptimizer, and each function is accompanied by a machine-checked formal +proof in [HOL-Light](https://hol-light.github.io/) that its mathematical +result is correct, based on a formal model of the underlying machine. Each +function is moreover written in a constant-time style to avoid timing +side-channels. + +### Building + +Assuming a suitable operating system (e.g. Linux, Mac OS X, or Windows with +Cygwin) and a few basic build tools you should be able to download the repo and +build with just a few basic commands. On an x86 machine: + + git clone https://github.com/awslabs/s2n-bignum + cd ./s2n-bignum + (cd ./x86; make) + +while on an ARM machine (aarch64, arm64) just replace "x86" with "arm": + + git clone https://github.com/awslabs/s2n-bignum + cd ./s2n-bignum + (cd ./arm; make) + +This results in a library of bignum mathematical functions that can be +called from C or other languages. To run basic unit tests on the library +just built: + + (cd ./tests; make go) + +To run the benchmarking code to get performance numbers for your platform +(this usually takes several minutes): + + (cd ./benchmarks; make go) + +The code is all written in assembler, with each individual mathematical +function consisting of a `.S` file that can be assembled by directly +invoking the GNU C compiler `gcc` or by explicitly combining the C +preprocessor and an assembler or other C or C++ compiler. 
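As an illustrative sketch (not taken verbatim from the upstream documentation): on an aarch64 machine with the GNU toolchain, a single ARM source file such as `generic/bignum_add.S` can typically be assembled from inside the `arm` subdirectory in either of the two ways just described. The `-I../include` path and the explicit preprocess-then-assemble pipeline mirror what the bundled Makefiles do; the choice of file and the one-step `gcc -c` variant are assumptions made for the example.

    # One step: let gcc run the C preprocessor on the .S file and assemble it
    gcc -c -I../include generic/bignum_add.S -o generic/bignum_add.o

    # Two steps: preprocess explicitly, then pipe the result to the assembler
    gcc -E -I../include -xassembler-with-cpp generic/bignum_add.S | as -o generic/bignum_add.o -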
If using your own
+build command, consult the existing Makefiles for guidance since there are some
+subtle variations even among assemblers (e.g. some C compilers won't handle
+multiple instructions per line when taking in assembler files).
+
+### Using the library
+
+The build process above results in a library that can be used to provide all
+the functionality together (e.g. `x86/libs2nbignum.a` for an x86 machine),
+as well as individual object files, one per function, that can be used for more
+fine-grained linkage (e.g. `x86/generic/bignum_add.o` for the addition
+function on x86). The functions all use standard Application Binary Interfaces
+to connect to C and other high-level languages; the ABI determines, for
+example, which registers or stack frames hold the arguments to a function when
+called. The x86+Windows combination uses a non-standard ABI, which can
+explicitly be forced using the additional option `-DWINDOWS_ABI=1` when
+building. In either case the C-level prototypes for the functions are collected
+in a header file that can be included in C programs to specify the interfaces.
+A quick browse through this also gives an idea of what functions the
+library provides.
+
+[s2n-bignum/include/s2n-bignum.h](https://github.com/awslabs/s2n-bignum/blob/main/include/s2n-bignum.h)
+
+You can include this in a C program as usual, after first including
+the standard header defining the types `uint64_t` etc. that are
+basic for s2n-bignum:
+
+ #include <inttypes.h>
+ #include "s2n-bignum.h"
+
+Here is a small complete C program `myprogram.c` calling the
+library, computing the modular inverse of 12345 modulo the wordsize
+using the `word_negmodinv` function provided by the library,
+then printing out confirmation that it works:
+
+```
+#include <inttypes.h>
+#include <stdio.h>
+#include "s2n-bignum.h"
+
+int main(void)
+{
+ uint64_t x = 12345;
+ uint64_t y = -word_negmodinv(x);
+ printf("%ld * %ld = %ld (mod 2^64)\n",x,y,x*y);
+}
+```
+
+Assuming you are on an x86 machine in a directory above the
+`s2n-bignum` subdirectory (otherwise change the `.`
+below into an appropriate path and/or change `x86` to `arm`),
+you can compile this as follows, specifying the paths to the
+library itself and the headers:
+
+ gcc -o myprogram myprogram.c -I./s2n-bignum/include/ -L./s2n-bignum/x86/ -ls2nbignum
+
+and then run it as usual to see the output:
+
+ $ ./myprogram
+ 12345 * 5288216061308878345 = 1 (mod 2^64)
+
+### Architectural and microarchitectural considerations
+
+The overall C-level interface supported by the library is the same
+regardless of architecture, ARM or x86. In each case, however, there
+are some architectural and microarchitectural considerations to be
+aware of:
+
+ * On ARM, each function will work correctly on any existing
+ microarchitecture. However, some functions have two variants
+ with significant performance differences according to platform.
+ The versions with `_alt` suffixes are designed to maximize
+ performance on microarchitectures with higher multiplier
+ throughput (typically more recent ones, like the Apple M1), while
+ the non-alt variants are better suited to 'traditional' ARM
+ microarchitectures with lower multiplier throughput (specifically,
+ limited pipelining of the `UMULH` instruction to get the
+ high part of a 64x64-bit product).
+
+ * On x86, all generic bignum functions (in the `x86/generic`
+ subdirectory) will work correctly on any existing microarchitecture.
+ Some of the more highly optimized functions for specific elliptic
+ curves etc.
require the BMI and ADX instruction set extensions
+ (specifically the `MULX`, `ADCX` and `ADOX` instructions).
+ In such cases, the `_alt` suffix forms are provided
+ as a backup that will work for older platforms. In all cases where
+ there is such an alt form provided, the non-alt form is likely to be
+ faster where those instructions are supported, as on most recent
+ x86-64 chips.
+
+If you are unsure which version of a function to use on your platform, a simple
+test is to run the benchmarking code (see above) and examine the results. For
+example, this is a contemporary ARM platform where the alt form performs
+better:
+
+```
+...
+curve25519_x25519 : 26661.8 ns each (var 0.8%, corr 0.03) = 37507 ops/sec
+curve25519_x25519_alt : 19297.7 ns each (var 0.4%, corr -0.03) = 51820 ops/sec
+...
+```
+
+and this is a typical x86 chip where the non-alt form is faster:
+
+```
+...
+curve25519_x25519 : 30103.0 ns each (var 0.0%, corr -0.14) = 33219 ops/sec
+curve25519_x25519_alt : 38097.0 ns each (var 0.0%, corr -0.11) = 26249 ops/sec
+...
+```
+
+while this is a very old x86 machine where the required instructions for
+the non-alt form are not supported:
+
+```
+...
+curve25519_x25519 : *** NOT APPLICABLE ***
+curve25519_x25519_alt : 51977.2 ns each (var 1.4%, corr 0.01) = 19239 ops/sec
+...
+```
+
+### Constant-time bignums
+
+The s2n-bignum library provides a simple and flexible API for manipulating
+bignums, which are integers of arbitrary size (operations focus on nonnegative
+integers, but use 2s complement where appropriate for negation). The integers
+are represented as little-endian arrays of unsigned 64-bit "digits", where the
+digits can be accessed via the standard `uint64_t` type in C. They can be
+explicitly read and written as normal C arrays as well as via the s2n-bignum
+API. For example, here is how one might set up the constant 2^255 - 19
+as a 4-digit bignum (note the little-endian digit representation, independent
+of the byte order of the underlying machine):
+
+```
+uint64_t p_25519[4] =
+{
+ UINT64_C(0xffffffffffffffed),
+ UINT64_C(0xffffffffffffffff),
+ UINT64_C(0xffffffffffffffff),
+ UINT64_C(0x7fffffffffffffff)
+};
+```
+
+The arrays can be arbitrarily large or small and the sizes can be runtime
+parameters, with no overall restriction to specific sizes like 4 in the example
+above. However, in contrast to many standard bignum interfaces like that
+supported by [GMP](https://gmplib.org/), the operations do not dynamically
+adjust the sizes, but require them to be explicitly specified by the user when
+calling each function. The reason for this is to allow flexibility and
+genericity while also enforcing "constant-time" behavior for security from
+timing side-channels in cryptographic applications.
+
+By "constant-time" we mean roughly that a given bignum operation takes a time
+that is independent of the actual numbers involved, depending only on their
+*nominal* sizes. Each s2n-bignum operation takes and returns bignums
+of specified nominal sizes, and manipulates them on the basis of the nominal
+sizes only, independent of their actual numeric values (even if those are
+zero). If a result does not fit in the size provided, it is systematically
+truncated modulo that size. s2n-bignum functions never strip away leading
+zeros to make numbers shorter, nor do they allocate extra space to make them
+longer; indeed, they perform no memory allocation or other OS calls at all.
+
+For instance, the basic multiplication function has the following C prototype:
+
+```
+void bignum_mul(uint64_t p,uint64_t *z, uint64_t m,uint64_t *x, uint64_t n,uint64_t *y);
+```
+
+This means that `x` points to an `m`-digit bignum (little-endian, with
+64-bit words as the digits), `y` points to an `n`-digit bignum, and the
+function writes their product to the `p`-word buffer pointed to by `z`,
+truncating it modulo 2^(64p) if it doesn't fit. In this setting
+with nominal sizes for all numbers, the "constant-time" characteristic means
+that the actual sequence of machine instructions executed, including the
+specific addresses and sequencing of memory loads and stores, is
+*independent of the numbers themselves*, depending only on their nominal sizes
+(`m`, `n` and `p` for the above example).
+
+Since the s2n-bignum interface is just using pointers to pre-existing arrays,
+any allocation of memory is the caller's responsibility. Some s2n-bignum
+functions use space on the stack for intermediate computations (or just to save
+and restore registers), but only in cases where that size is bounded and
+moderate. For the few generic-size functions that need similarly generic (and
+hence unbounded a priori) space for intermediate storage, it needs to be
+provided by the caller via an additional argument. For example, the final
+argument to the `bignum_modinv` (modular inverse) function is to a temporary
+buffer of a size depending on the generic size parameter `k` (specifically,
+according to the API it should be `>= 3 * k`):
+
+```
+void bignum_modinv (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t);
+```
+
+In order to keep the generic API more convenient, minimizing the need for such
+additional parameters, functions sometimes read from and write to the provided
+buffers in interleaved fashion in a way that assumes inputs and outputs do not
+overlap. Aliasing of input and output buffers is however usually allowed in
+fixed-size functions and (provided they are exactly the same, not overlapped in
+more intricate fashion) "linear" generic-sized functions; consult the detailed
+API reference for more details.
+
+### What's in the library?
+
+The s2n-bignum library supports basic bignum arithmetic using the API specified
+above, as well as a host of related operations, the aim being to provide
+convenient and reliable building-blocks for higher-level cryptographic
+functionality. The range of operations provided covers:
+
+- Elementary operations on 64-bit words, mainly to provide reference
+ implementations that are constant-time, e.g. `word_max` (maximum),
+ `word_clz` (counting leading zeros)
+
+- Basic generic-size bignum arithmetic functionality like `bignum_add`
+ (addition), `bignum_sub` (subtraction), `bignum_mul` (multiplication),
+ `bignum_eq` (equality comparison).
+
+- Generic-size constant-time data manipulation like `bignum_digit` (selecting
+ a digit, like array indexing but without any difference in memory access
+ pattern from element number) and `bignum_mux` (multiplexing or if-then-else,
+ analogous to C `b ? x : y`).
+
+- Generic-size Montgomery operations like `bignum_montmul` (Montgomery
+ multiplication), `bignum_montredc` (Montgomery reduction) and
+ `bignum_montifier` (computes constant for mapping into Montgomery form)
+ for performing modular arithmetic in Montgomery form for any odd modulus.
+
+- Optimized multiplication and squaring operations for specific sizes, e.g.
+ `bignum_mul_4_8` (multiply two 4-digit numbers with 8-digit result) and
+ `bignum_sqr_16_32` (square a 16-digit number with 32-digit result).
+
+- Optimized modular and/or Montgomery arithmetic operations for common
+ primes that are field characteristics for specific elliptic curves,
+ e.g. `bignum_montmul_p521` (Montgomery multiplication modulo
+ 2^521 - 1) for NIST P-521, `bignum_sqr_p25519` (modular
+ squaring modulo 2^255 - 19 for curve25519).
+
+- Full top-level point operations for supported elliptic curves, e.g.
+ `p256_jadd` (point addition on NIST P-256 curve), `secp256k1_jdouble`
+ (point doubling for secp256k1). These usually assume a particular
+ coordinate representation, Jacobian in these cases (hence the "j").
+
+The elliptic curves with some special support are the following; the degree of
+support varies from just modular and/or Montgomery arithmetic operations for
+the field characteristic modulus, up to basic point operations, and even in
+some cases full scalar multiplication (e.g. `curve25519_x25519`).
+
+- curve25519/edwards25519
+- NIST P-256
+- NIST P-384
+- NIST P-521
+- secp256k1
+- SM2
+
+### Testing and formal verification
+
+The basic testing setup as mentioned above subjects each function to a number
+of unit tests, mainly using pseudo-random inputs and comparing against
+conceptually simpler (but neither efficient nor constant-time) C references,
+also doing some checking of pre-tabulated "known correct" results. This
+process
+
+ (cd ./tests; make go)
+
+should be enough to expose any basic problems, typically failure to assemble
+and link the code correctly. However, in pursuit of the highest standards of
+correctness, that basic testing is complemented by the far more rigorous and
+sophisticated process of *formal verification*.
+
+The formal verification process performs a machine-checked proof that the
+actual object file generated by the build process satisfies a high-level
+mathematical specification for *all* inputs (not just for specific test cases),
+assuming a formal model of how each processor (ARM or x86) executes code. These
+models make some simplifications and idealizations but model pretty faithfully
+the way in which specific machine instructions modify registers, flags and
+memory.
+
+To perform the formal proof for a particular function, you will need to install
+the latest version of [HOL Light](https://github.com/jrh13/hol-light/).
+The OPAM version might not work because it does not contain sufficiently recent
+libraries.
+To install HOL Light, please follow its
+[README](https://github.com/jrh13/hol-light/blob/master/README) instruction.
+After installation, set the `HOLDIR` environment variable to the path of
+the `hol-light` directory and use the Makefile within either the `arm` or
+`x86` directories to generate a target of the form
+`function_name.correct` for a corresponding object file `function_name.o`.
+Alternatively, the entire collection of functions can all be formally proved
+via the `proofs` pseudo-target. This is likely to be very time-consuming and
+hence better executed with some parallelism, e.g.
+
+ nohup make -j 16 proofs &
+
+The proof process is controlled by a corresponding "proof script" in the
+`proofs` subdirectory with corresponding name `proofs/function_name.ml`.
+The technical details of how the machine is modeled and how the proof is
+performed are too involved to enter into in detail in this brief summary,
+but by examining the proof script file you can find detailed specifications
+for each function, which might be considered the most rigorous possible
+form of API documentation.
+
+For example the file `arm/proofs/bignum_mul_p25519.ml` starts with a lengthy
+sequence of 32-bit words that specify the machine code being verified. This is
+not just accepted a priori as the canonical machine code, but actually checked
+against the object file to make sure it is indeed what is generated by the
+build process. The later proof then shows that executing this on the idealized
+machine model guarantees some toplevel mathematical properties. In this case,
+the specification that is proved looks like this:
+
+```
+nonoverlapping (word pc,0x288) (z,8 * 4)
+ ==> ensures arm
+ (\s. aligned_bytes_loaded s (word pc) bignum_mul_p25519_mc /\
+ read PC s = word pc /\
+ read X30 s = returnaddress /\
+ C_ARGUMENTS [z; x; y] s /\
+ bignum_from_memory(x,4) s = m /\
+ bignum_from_memory(y,4) s = n)
+ (\s. read PC s = returnaddress /\
+ bignum_from_memory(z,4) s = (m * n) MOD p_25519)
+ (MAYCHANGE [PC; X1; X2; X3; X4; X5; X6; X7; X8; X9;
+ X10; X11; X12; X13; X14; X15; X16; X17] ,,
+ MAYCHANGE [memory :> bytes(z,8 * 4)] ,,
+ MAYCHANGE SOME_FLAGS)
+```
+
+A detailed understanding of these formal specifications would take careful
+study of the underlying logical definitions, but in somewhat general
+impressionistic terms we can turn it into English as follows:
+
+ - We assume the output buffer `z` doesn't overlap the code being executed
+ but otherwise make no aliasing assumptions for inputs versus outputs.
+
+ - ASSUMING that we start in a state where
+
+ - the machine code specified at the start is loaded (4-byte aligned
+ as per ARM restrictions) and the program counter register `PC`
+ points to the start of it
+
+ - the return address to the caller is in register `X30` as per ABI
+
+ - the pointers `z`, `x` and `y` are set up in registers according
+ to the standard ABI rules
+
+ - the pointers `x` and `y` point at 4-digit bignums with respective
+ values `m` and `n`
+
+ - THEN we will reach another state where
+
+ - The program counter `PC` has jumped to the return address
+
+ - The buffer pointed to by `z` contains the mathematical answer
+ (x * y) mod p_25519, where p_25519 is an abbreviation for
+ 2^255 - 19.
+
+ - BETWEEN initial and final states, the only components of the
+ machine that could have been modified are:
+
+ - Registers including the program counter (of course) and general
+ purpose registers `X1`, ..., `X17` (freely modifiable by a subroutine
+ according to the ABI)
+
+ - The specified output buffer at address `z` of size 4 x 8-byte words.
+
+ - The machine flags (also freely modifiable according to the ABI)
+
+**Global Assumptions.**
+In addition to the assumptions described in the formal specifications,
+s2n-bignum implementations globally assume that the execution environment is
+configured as follows:
+
+- Alignment checking is disabled (`AC` flag in x86, `SCTLR_ELx.A` in ARM).
+ If these control bits are set, passing unaligned pointers as input/output
+ buffers of an s2n-bignum function may cause a crash.
If you are invoking the
+ functions from C/C++ via the C header file (`s2n-bignum.h`) however, the
+ alignment restriction on int-typed pointers in the C standard such as `uint64_t*`
+ will guarantee that the pointers are aligned regardless of the control bit.
+ The alignment conditions for code and stack pointers in ARM will be
+ explicitly described in the formal specifications.
+
+
+- Little-endian is set in ARM (`E` mask of `CPSR` in ARM). We believe all code
+ works equally well on a big-endian machine, but we do not validate that fact
+ ourselves, and the instruction model underlying the formal proof does not
+ directly address this question since it is assuming little-endian.
+
+- It is assumed that s2n-bignum is run in 64-bit mode.
+
+### Benchmarking and "constant time"
+
+The benchmarking setup included in the repository can be invoked, as mentioned
+above, by the following, starting in the s2n-bignum root directory and after
+building the library:
+
+ (cd ./benchmarks; make go)
+
+After some explanatory information which summarizes the explanations below,
+this shows a list of the execution time behavior of each function on the
+current platform, one per line in alphabetical order; generic-size functions
+like `bignum_add` are exercised on one or more specific sizes as shown in
+parentheses after the function name.
+
+```
+bignum_add (4x4->4) : 3.4 ns each (var 1.6%, corr 0.07) = 296073608 ops/sec
+bignum_add (6x6->6) : 4.3 ns each (var 1.3%, corr 0.02) = 233426704 ops/sec
+bignum_add (32x32->32) : 18.4 ns each (var 0.8%, corr -0.01) = 54430655 ops/sec
+bignum_add_p25519 : 2.2 ns each (var 2.9%, corr -0.01) = 462501779 ops/sec
+bignum_add_p256 : 2.9 ns each (var 1.6%, corr -0.01) = 342429670 ops/sec
+bignum_add_p256k1 : 2.6 ns each (var 1.9%, corr -0.04) = 387458274 ops/sec
+bignum_add_p384 : 4.4 ns each (var 1.1%, corr -0.03) = 226923614 ops/sec
+bignum_add_p521 : 4.3 ns each (var 1.4%, corr 0.02) = 232991612 ops/sec
+bignum_amontifier (32) : 2993.4 ns each (var 0.1%, corr -0.08) = 334073 ops/sec
+bignum_amontmul (32) : 2410.8 ns each (var 0.0%, corr -0.04) = 414797 ops/sec
+bignum_amontredc (32/16 -> 16) : 317.1 ns each (var 0.1%, corr -0.01) = 3153693 ops/sec
+bignum_amontsqr (32 -> 32) : 2410.2 ns each (var 0.0%, corr 0.05) = 414901 ops/sec
+...
+word_max : 0.8 ns each (var 4.2%, corr -0.03) = 1234333460 ops/sec
+word_min : 0.8 ns each (var 3.4%, corr 0.05) = 1237623762 ops/sec
+word_negmodinv : 2.7 ns each (var 2.0%, corr -0.11) = 366568915 ops/sec
+word_recip : 7.4 ns each (var 0.9%, corr -0.06) = 134380815 ops/sec
+```
+
+The first number reported is the average runtime, in nanoseconds (1 ns =
+10^-9 seconds, or one billionth of a second), over a large number of
+calls, and the last one is the reciprocal of this to give the average number of
+operations per second. Hence "smaller is better" for the first number while
+"bigger is better" for the final one.
+
+The "var" and "corr" numbers in parentheses attempt to give some empirical
+results on the variation in runtime with respect to the data being manipulated.
+Since this is intended to be invariant, one wishes these numbers to be small,
+though there is inevitably some variation because of miscellaneous platform
+factors. For each "bit density" between 0 and 64, pseudo-random inputs are
+generated with that bit density; the bit density is essentially the average
+number of 1 bits in each 64-bit word of these pseudo-random numbers (so bit
+density 0 means all zeros, bit density 64 means all 1s).
The function is +separately timed over each of these. The end results give the coefficient of +variation "var" (standard deviation divided by mean) and correlation +coefficient "corr" of runtime with bit density. + +As explained above, the "constant time" design principle is that the sequence +of machine instructions executed, including the access pattern of memory reads +and writes, is independent of the actual numeric data being manipulated, once +any parametric sizes are fixed. Any failures in practice to actually take +exactly the same time on all data (beyond some expected experimental errors +and flaws in the timing framework) could only arise if either: + + - The above "constant time" design discipline is not followed at all points + as intended. We consider this very unlikely, but in contrast to functional + correctness it is not actually rigorously machine-checked at present. We + anticipate in the future subjecting the code to automated dataflow analysis + as an additional validation test. + + - Some individual machine instructions that are used take a time that depends + on their data. We have specifically avoided certain machine instructions + known to be problematic in this respect (e.g. division instructions), but + we have no absolute guarantees from the hardware makers that there are no + such variations in the instructions we use, except on ARM platforms where + the "DIT" = "data-independent timing" bit is set. + +## Security + +See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. + +## License + +This project is licensed under the Apache-2.0 License or the ISC License or the MIT-0 License. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/Makefile new file mode 100644 index 00000000000..e1d37985dd3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/Makefile @@ -0,0 +1,518 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +OSTYPE_RESULT=$(shell uname -s) +ARCHTYPE_RESULT=$(shell uname -m) + +# Assembler directives that mark symbols as .hidden +# or .private_extern can be enabled by passing +# in the S2N_BN_HIDE_SYMBOLS parameter as: +# +# make S2N_BN_HIDE_SYMBOLS=1 +# + +ifeq ($(S2N_BN_HIDE_SYMBOLS),1) +SYMBOL_HIDING=-DS2N_BN_HIDE_SYMBOLS=1 +else +SYMBOL_HIDING= +endif + + +# Add explicit language input parameter to cpp, otherwise the use of #n for +# numeric literals in ARM code is a problem when used inside #define macros +# since normally that means stringization. +# +# Some clang-based preprocessors seem to behave differently, and get confused +# by single-quote characters in comments, so we eliminate // comments first. + +ifeq ($(OSTYPE_RESULT),Darwin) +PREPROCESS=sed -e 's/\/\/.*//' | $(CC) -E -I../include $(SYMBOL_HIDING) -xassembler-with-cpp - +else +PREPROCESS=$(CC) -E -I../include $(SYMBOL_HIDING) -xassembler-with-cpp - +endif + +# Generally GNU-type assemblers are happy with multiple instructions on +# a line, but we split them up anyway just in case. + +SPLIT=tr ';' '\n' + +# If actually on an ARM8 machine, just use the assembler (as). 
Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). For the clang +# version on OS X we just add the "-arch arm64" option. For the Linux/gcc +# toolchain we assume the presence of the special cross-assembler. This +# can be installed via something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +ifeq ($(ARCHTYPE_RESULT),aarch64) +ASSEMBLE=as +OBJDUMP=objdump -d +else +ifeq ($(ARCHTYPE_RESULT),arm64) +ASSEMBLE=as +OBJDUMP=objdump -d +else +ifeq ($(OSTYPE_RESULT),Darwin) +ASSEMBLE=as -arch arm64 +OBJDUMP=otool -tvV +else +ASSEMBLE=aarch64-linux-gnu-as +OBJDUMP=aarch64-linux-gnu-objdump -d +endif +endif +endif + +# List of object files for point operations and bignum operations + +POINT_OBJ = curve25519/curve25519_ladderstep.o \ + curve25519/curve25519_ladderstep_alt.o \ + curve25519/curve25519_pxscalarmul.o \ + curve25519/curve25519_pxscalarmul_alt.o \ + curve25519/curve25519_x25519.o \ + curve25519/curve25519_x25519_alt.o \ + curve25519/curve25519_x25519_byte.o \ + curve25519/curve25519_x25519_byte_alt.o \ + curve25519/curve25519_x25519base.o \ + curve25519/curve25519_x25519base_alt.o \ + curve25519/curve25519_x25519base_byte.o \ + curve25519/curve25519_x25519base_byte_alt.o \ + curve25519/edwards25519_decode.o \ + curve25519/edwards25519_decode_alt.o \ + curve25519/edwards25519_encode.o \ + curve25519/edwards25519_epadd.o \ + curve25519/edwards25519_epadd_alt.o \ + curve25519/edwards25519_epdouble.o \ + curve25519/edwards25519_epdouble_alt.o \ + curve25519/edwards25519_pdouble.o \ + curve25519/edwards25519_pdouble_alt.o \ + curve25519/edwards25519_pepadd.o \ + curve25519/edwards25519_pepadd_alt.o \ + curve25519/edwards25519_scalarmulbase.o \ + curve25519/edwards25519_scalarmulbase_alt.o \ + curve25519/edwards25519_scalarmuldouble.o \ + curve25519/edwards25519_scalarmuldouble_alt.o \ + p256/p256_montjadd.o \ + p256/p256_montjadd_alt.o \ + p256/p256_montjdouble.o \ + p256/p256_montjdouble_alt.o \ + p256/p256_montjmixadd.o \ + p256/p256_montjmixadd_alt.o \ + p256/p256_montjscalarmul.o \ + p256/p256_montjscalarmul_alt.o \ + p256/p256_scalarmul.o \ + p256/p256_scalarmul_alt.o \ + p256/p256_scalarmulbase.o \ + p256/p256_scalarmulbase_alt.o \ + p384/p384_montjadd.o \ + p384/p384_montjadd_alt.o \ + p384/p384_montjdouble.o \ + p384/p384_montjdouble_alt.o \ + p384/p384_montjmixadd.o \ + p384/p384_montjmixadd_alt.o \ + p384/p384_montjscalarmul.o \ + p384/p384_montjscalarmul_alt.o \ + p521/p521_jadd.o \ + p521/p521_jadd_alt.o \ + p521/p521_jdouble.o \ + p521/p521_jdouble_alt.o \ + p521/p521_jmixadd.o \ + p521/p521_jmixadd_alt.o \ + p521/p521_jscalarmul.o \ + p521/p521_jscalarmul_alt.o \ + secp256k1/secp256k1_jadd.o \ + secp256k1/secp256k1_jadd_alt.o \ + secp256k1/secp256k1_jdouble.o \ + secp256k1/secp256k1_jdouble_alt.o \ + secp256k1/secp256k1_jmixadd.o \ + secp256k1/secp256k1_jmixadd_alt.o \ + sm2/sm2_montjadd.o \ + sm2/sm2_montjadd_alt.o \ + sm2/sm2_montjdouble.o \ + sm2/sm2_montjdouble_alt.o \ + sm2/sm2_montjmixadd.o \ + sm2/sm2_montjmixadd_alt.o \ + sm2/sm2_montjscalarmul.o \ + sm2/sm2_montjscalarmul_alt.o + +BIGNUM_OBJ = curve25519/bignum_add_p25519.o \ + curve25519/bignum_cmul_p25519.o \ + curve25519/bignum_double_p25519.o \ + curve25519/bignum_inv_p25519.o \ + curve25519/bignum_invsqrt_p25519.o \ + curve25519/bignum_invsqrt_p25519_alt.o \ + curve25519/bignum_madd_n25519.o \ + curve25519/bignum_madd_n25519_alt.o \ 
+ curve25519/bignum_mod_m25519_4.o \ + curve25519/bignum_mod_n25519.o \ + curve25519/bignum_mod_n25519_4.o \ + curve25519/bignum_mod_p25519_4.o \ + curve25519/bignum_mul_p25519.o \ + curve25519/bignum_mul_p25519_alt.o \ + curve25519/bignum_neg_p25519.o \ + curve25519/bignum_optneg_p25519.o \ + curve25519/bignum_sqr_p25519.o \ + curve25519/bignum_sqr_p25519_alt.o \ + curve25519/bignum_sqrt_p25519.o \ + curve25519/bignum_sqrt_p25519_alt.o \ + curve25519/bignum_sub_p25519.o \ + fastmul/bignum_emontredc_8n.o \ + fastmul/bignum_emontredc_8n_cdiff.o \ + fastmul/bignum_kmul_16_32.o \ + fastmul/bignum_kmul_32_64.o \ + fastmul/bignum_ksqr_16_32.o \ + fastmul/bignum_ksqr_32_64.o \ + fastmul/bignum_mul_4_8.o \ + fastmul/bignum_mul_4_8_alt.o \ + fastmul/bignum_mul_6_12.o \ + fastmul/bignum_mul_6_12_alt.o \ + fastmul/bignum_mul_8_16.o \ + fastmul/bignum_mul_8_16_alt.o \ + fastmul/bignum_sqr_4_8.o \ + fastmul/bignum_sqr_4_8_alt.o \ + fastmul/bignum_sqr_6_12.o \ + fastmul/bignum_sqr_6_12_alt.o \ + fastmul/bignum_sqr_8_16.o \ + fastmul/bignum_sqr_8_16_alt.o \ + generic/bignum_add.o \ + generic/bignum_amontifier.o \ + generic/bignum_amontmul.o \ + generic/bignum_amontredc.o \ + generic/bignum_amontsqr.o \ + generic/bignum_bitfield.o \ + generic/bignum_bitsize.o \ + generic/bignum_cdiv.o \ + generic/bignum_cdiv_exact.o \ + generic/bignum_cld.o \ + generic/bignum_clz.o \ + generic/bignum_cmadd.o \ + generic/bignum_cmnegadd.o \ + generic/bignum_cmod.o \ + generic/bignum_cmul.o \ + generic/bignum_coprime.o \ + generic/bignum_copy.o \ + generic/bignum_copy_row_from_table.o \ + generic/bignum_copy_row_from_table_8n.o \ + generic/bignum_copy_row_from_table_16.o \ + generic/bignum_copy_row_from_table_32.o \ + generic/bignum_ctd.o \ + generic/bignum_ctz.o \ + generic/bignum_demont.o \ + generic/bignum_digit.o \ + generic/bignum_digitsize.o \ + generic/bignum_divmod10.o \ + generic/bignum_emontredc.o \ + generic/bignum_eq.o \ + generic/bignum_even.o \ + generic/bignum_ge.o \ + generic/bignum_gt.o \ + generic/bignum_iszero.o \ + generic/bignum_le.o \ + generic/bignum_lt.o \ + generic/bignum_madd.o \ + generic/bignum_modadd.o \ + generic/bignum_moddouble.o \ + generic/bignum_modexp.o \ + generic/bignum_modifier.o \ + generic/bignum_modinv.o \ + generic/bignum_modoptneg.o \ + generic/bignum_modsub.o \ + generic/bignum_montifier.o \ + generic/bignum_montmul.o \ + generic/bignum_montredc.o \ + generic/bignum_montsqr.o \ + generic/bignum_mul.o \ + generic/bignum_muladd10.o \ + generic/bignum_mux.o \ + generic/bignum_mux16.o \ + generic/bignum_negmodinv.o \ + generic/bignum_nonzero.o \ + generic/bignum_normalize.o \ + generic/bignum_odd.o \ + generic/bignum_of_word.o \ + generic/bignum_optadd.o \ + generic/bignum_optneg.o \ + generic/bignum_optsub.o \ + generic/bignum_optsubadd.o \ + generic/bignum_pow2.o \ + generic/bignum_shl_small.o \ + generic/bignum_shr_small.o \ + generic/bignum_sqr.o \ + generic/bignum_sub.o \ + generic/word_bytereverse.o \ + generic/word_clz.o \ + generic/word_ctz.o \ + generic/word_divstep59.o \ + generic/word_max.o \ + generic/word_min.o \ + generic/word_negmodinv.o \ + generic/word_popcount.o \ + generic/word_recip.o \ + p256/bignum_add_p256.o \ + p256/bignum_bigendian_4.o \ + p256/bignum_cmul_p256.o \ + p256/bignum_deamont_p256.o \ + p256/bignum_demont_p256.o \ + p256/bignum_double_p256.o \ + p256/bignum_half_p256.o \ + p256/bignum_inv_p256.o \ + p256/bignum_littleendian_4.o \ + p256/bignum_mod_n256.o \ + p256/bignum_mod_n256_4.o \ + p256/bignum_mod_p256.o \ + p256/bignum_mod_p256_4.o \ + 
p256/bignum_montinv_p256.o \ + p256/bignum_montmul_p256.o \ + p256/bignum_montmul_p256_alt.o \ + p256/bignum_montsqr_p256.o \ + p256/bignum_montsqr_p256_alt.o \ + p256/bignum_mux_4.o \ + p256/bignum_neg_p256.o \ + p256/bignum_nonzero_4.o \ + p256/bignum_optneg_p256.o \ + p256/bignum_sub_p256.o \ + p256/bignum_tomont_p256.o \ + p256/bignum_triple_p256.o \ + p384/bignum_add_p384.o \ + p384/bignum_bigendian_6.o \ + p384/bignum_cmul_p384.o \ + p384/bignum_deamont_p384.o \ + p384/bignum_demont_p384.o \ + p384/bignum_double_p384.o \ + p384/bignum_half_p384.o \ + p384/bignum_inv_p384.o \ + p384/bignum_littleendian_6.o \ + p384/bignum_mod_n384.o \ + p384/bignum_mod_n384_6.o \ + p384/bignum_mod_p384.o \ + p384/bignum_mod_p384_6.o \ + p384/bignum_montinv_p384.o \ + p384/bignum_montmul_p384.o \ + p384/bignum_montmul_p384_alt.o \ + p384/bignum_montsqr_p384.o \ + p384/bignum_montsqr_p384_alt.o \ + p384/bignum_mux_6.o \ + p384/bignum_neg_p384.o \ + p384/bignum_nonzero_6.o \ + p384/bignum_optneg_p384.o \ + p384/bignum_sub_p384.o \ + p384/bignum_tomont_p384.o \ + p384/bignum_triple_p384.o \ + p521/bignum_add_p521.o \ + p521/bignum_cmul_p521.o \ + p521/bignum_deamont_p521.o \ + p521/bignum_demont_p521.o \ + p521/bignum_double_p521.o \ + p521/bignum_fromlebytes_p521.o \ + p521/bignum_half_p521.o \ + p521/bignum_inv_p521.o \ + p521/bignum_mod_n521_9.o \ + p521/bignum_mod_p521_9.o \ + p521/bignum_montmul_p521.o \ + p521/bignum_montmul_p521_alt.o \ + p521/bignum_montsqr_p521.o \ + p521/bignum_montsqr_p521_alt.o \ + p521/bignum_mul_p521.o \ + p521/bignum_mul_p521_alt.o \ + p521/bignum_neg_p521.o \ + p521/bignum_optneg_p521.o \ + p521/bignum_sqr_p521.o \ + p521/bignum_sqr_p521_alt.o \ + p521/bignum_sub_p521.o \ + p521/bignum_tolebytes_p521.o \ + p521/bignum_tomont_p521.o \ + p521/bignum_triple_p521.o \ + secp256k1/bignum_add_p256k1.o \ + secp256k1/bignum_cmul_p256k1.o \ + secp256k1/bignum_deamont_p256k1.o \ + secp256k1/bignum_demont_p256k1.o \ + secp256k1/bignum_double_p256k1.o \ + secp256k1/bignum_half_p256k1.o \ + secp256k1/bignum_mod_n256k1_4.o \ + secp256k1/bignum_mod_p256k1_4.o \ + secp256k1/bignum_montmul_p256k1.o \ + secp256k1/bignum_montmul_p256k1_alt.o \ + secp256k1/bignum_montsqr_p256k1.o \ + secp256k1/bignum_montsqr_p256k1_alt.o \ + secp256k1/bignum_mul_p256k1.o \ + secp256k1/bignum_mul_p256k1_alt.o \ + secp256k1/bignum_neg_p256k1.o \ + secp256k1/bignum_optneg_p256k1.o \ + secp256k1/bignum_sqr_p256k1.o \ + secp256k1/bignum_sqr_p256k1_alt.o \ + secp256k1/bignum_sub_p256k1.o \ + secp256k1/bignum_tomont_p256k1.o \ + secp256k1/bignum_triple_p256k1.o \ + sm2/bignum_add_sm2.o \ + sm2/bignum_cmul_sm2.o \ + sm2/bignum_deamont_sm2.o \ + sm2/bignum_demont_sm2.o \ + sm2/bignum_double_sm2.o \ + sm2/bignum_half_sm2.o \ + sm2/bignum_inv_sm2.o \ + sm2/bignum_mod_nsm2.o \ + sm2/bignum_mod_nsm2_4.o \ + sm2/bignum_mod_sm2.o \ + sm2/bignum_mod_sm2_4.o \ + sm2/bignum_montinv_sm2.o \ + sm2/bignum_montmul_sm2.o \ + sm2/bignum_montmul_sm2_alt.o \ + sm2/bignum_montsqr_sm2.o \ + sm2/bignum_montsqr_sm2_alt.o \ + sm2/bignum_neg_sm2.o \ + sm2/bignum_optneg_sm2.o \ + sm2/bignum_sub_sm2.o \ + sm2/bignum_tomont_sm2.o \ + sm2/bignum_triple_sm2.o + +UNOPT_OBJ = p256/unopt/bignum_montmul_p256_base.o \ + p256/unopt/bignum_montsqr_p256_base.o \ + p256/unopt/p256_montjadd.o \ + p256/unopt/p256_montjdouble.o \ + p384/unopt/bignum_montmul_p384_base.o \ + p384/unopt/bignum_montsqr_p384_base.o \ + p384/unopt/p384_montjadd.o \ + p384/unopt/p384_montjdouble.o \ + p521/unopt/bignum_montmul_p521_base.o \ + p521/unopt/bignum_montsqr_p521_base.o \ 
+ p521/unopt/bignum_mul_p521_base.o \ + p521/unopt/bignum_sqr_p521_base.o \ + fastmul/unopt/bignum_emontredc_8n_base.o \ + fastmul/unopt/bignum_emontredc_8n_cdiff_base.o \ + fastmul/unopt/bignum_mul_8_16_base.o \ + fastmul/unopt/bignum_sqr_8_16_base.o + +OBJ = $(POINT_OBJ) $(BIGNUM_OBJ) + +# Tutorial assembly files + +TUTORIAL_PROOFS = $(wildcard tutorial/*.ml) + +TUTORIAL_OBJ = $(TUTORIAL_PROOFS:.ml=.o) tutorial/rel_loop2.o tutorial/rel_simp2.o tutorial/rel_veceq2.o tutorial/rel_equivtac2.o tutorial/rel_reordertac2.o + +# According to +# https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms, +# x18 should not be used for Apple platforms. Check this using grep. + +%.o : %.S + cat $< | $(PREPROCESS) | $(SPLIT) | grep -v -E '^\s+.quad\s+0x[0-9a-f]+$$' | $(ASSEMBLE) -o $@ - + $(OBJDUMP) $@ | ( ( ! grep --ignore-case -E 'w18|[^0]x18' ) || ( rm $@ ; exit 1 ) ) + cat $< | $(PREPROCESS) | $(SPLIT) | $(ASSEMBLE) -o $@ - + +libs2nbignum.a: $(OBJ) ; ar -rc libs2nbignum.a $(OBJ) + +clean:; rm -f libs2nbignum.a */*.o */*/*.o */*.correct */*.native + +# Proof-related parts +# +# The proof files are all independent, though each one loads the +# same common infrastructure "base.ml". So you can potentially +# run the proofs in parallel for more speed, e.g. +# +# nohup make -j 16 proofs & +# +# If you build hol-light yourself (see https://github.com/jrh13/hol-light) +# in your home directory, and do "make" inside the subdirectory hol-light, +# then the following HOLDIR setting should be right: + +HOLDIR?=$(HOME)/hol-light +HOLLIGHT:=$(HOLDIR)/hol.sh + +PROOF_BINS = $(OBJ:.o=.native) +PROOF_LOGS = $(OBJ:.o=.correct) +TUTORIAL_PROOF_BINS = $(TUTORIAL_PROOFS:.ml=.native) +TUTORIAL_PROOF_LOGS = $(TUTORIAL_PROOFS:.ml=.correct) + +# Build precompiled native binaries of HOL Light proofs + +proofs/simulator.native: proofs/simulator.ml ; ../tools/build-proof.sh proofs/simulator.ml "$(HOLLIGHT)" "$@" + +.SECONDEXPANSION: +%.native: proofs/$$(*F).ml %.o ; ../tools/build-proof.sh "$<" "$(HOLLIGHT)" "$@" + +# Run them and print the standard output+error at *.correct + +%.correct: %.native ; ../tools/run-proof.sh "$<" "$@" + +# Cases where a proof uses other proofs for lemmas and/or subroutines + +p256/bignum_montmul_p256.native: p256/unopt/bignum_montmul_p256_base.o +p384/bignum_montmul_p384.native: p384/unopt/bignum_montmul_p384_base.o +p521/bignum_montmul_p521.native: p521/unopt/bignum_montmul_p521_base.o +p256/bignum_montsqr_p256.native: p256/unopt/bignum_montsqr_p256_base.o +p384/bignum_montsqr_p384.native: p384/unopt/bignum_montsqr_p384_base.o +p521/bignum_montsqr_p521.native: p521/unopt/bignum_montsqr_p521_base.o +p521/bignum_mul_p521.native: p521/unopt/bignum_mul_p521_base.o +p521/bignum_sqr_p521.native: p521/unopt/bignum_sqr_p521_base.o +fastmul/bignum_emontredc_8n_cdiff.native: fastmul/unopt/bignum_emontredc_8n_base.o fastmul/unopt/bignum_emontredc_8n_cdiff_base.o +fastmul/bignum_mul_8_16.native: fastmul/unopt/bignum_mul_8_16_base.o +fastmul/bignum_sqr_8_16.native: fastmul/unopt/bignum_sqr_8_16_base.o +curve25519/curve25519_x25519.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519_alt.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519_byte.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519_byte_alt.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519base.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519base_alt.native: curve25519/bignum_inv_p25519.native 
+curve25519/curve25519_x25519base_byte.native: curve25519/bignum_inv_p25519.native +curve25519/curve25519_x25519base_byte_alt.native: curve25519/bignum_inv_p25519.native +curve25519/edwards25519_scalarmulbase.native: curve25519/bignum_inv_p25519.native +curve25519/edwards25519_scalarmulbase_alt.native: curve25519/bignum_inv_p25519.native +curve25519/edwards25519_scalarmuldouble.native: curve25519/bignum_inv_p25519.native +curve25519/edwards25519_scalarmuldouble_alt.native: curve25519/bignum_inv_p25519.native +generic/bignum_modexp.native: generic/bignum_amontifier.native generic/bignum_amontmul.native generic/bignum_demont.native generic/bignum_mux.native +p256/p256_montjadd.native: p256/unopt/p256_montjadd.o p256/bignum_montsqr_p256.native p256/bignum_montmul_p256.native p256/bignum_sub_p256.native +p256/p256_montjdouble.native: p256/unopt/p256_montjdouble.o p256/bignum_montsqr_p256.native p256/bignum_montmul_p256.native p256/bignum_sub_p256.native p256/bignum_add_p256.native +p256/p256_montjscalarmul.native: p256/p256_montjadd.native p256/p256_montjdouble.native +p256/p256_montjscalarmul_alt.native: p256/p256_montjadd_alt.native p256/p256_montjdouble_alt.native +p256/p256_scalarmul.native: p256/bignum_demont_p256.native p256/bignum_inv_p256.native p256/bignum_tomont_p256.native p256/p256_montjadd.native p256/p256_montjdouble.native p256/p256_montjmixadd.native +p256/p256_scalarmul_alt.native: p256/bignum_demont_p256.native p256/bignum_inv_p256.native p256/p256_montjadd_alt.native p256/p256_montjdouble_alt.native p256/p256_montjmixadd_alt.native +p256/p256_scalarmulbase.native: p256/bignum_demont_p256.native p256/bignum_inv_p256.native p256/p256_montjmixadd.native +p256/p256_scalarmulbase_alt.native: p256/bignum_demont_p256.native p256/bignum_inv_p256.native p256/p256_montjmixadd_alt.native +p384/p384_montjadd.native: p384/unopt/p384_montjadd.o p384/bignum_montsqr_p384.native p384/bignum_montmul_p384.native p384/bignum_sub_p384.native +p384/p384_montjdouble.native: p384/unopt/p384_montjdouble.o p384/bignum_montsqr_p384.native p384/bignum_montmul_p384.native p384/bignum_sub_p384.native p384/bignum_add_p384.native +p384/p384_montjscalarmul.native: \ + p384/p384_montjadd.native p384/p384_montjdouble.native \ + p384/bignum_sub_p384.native p384/bignum_add_p384.native +p384/p384_montjscalarmul_alt.native: p384/p384_montjadd_alt.native p384/p384_montjdouble_alt.native +p521/p521_jadd.native: p521/bignum_mul_p521.native p521/bignum_sqr_p521.native +p521/p521_jdouble.native: p521/bignum_mul_p521.native p521/bignum_sqr_p521.native +p521/p521_jscalarmul.native: p521/bignum_mod_n521_9.native p521/p521_jadd.native p521/p521_jdouble.native +p521/p521_jscalarmul_alt.native: p521/bignum_mod_n521_9.native +sm2/sm2_montjscalarmul.native: sm2/sm2_montjadd.native sm2/sm2_montjdouble.native +sm2/sm2_montjscalarmul_alt.native: sm2/sm2_montjadd_alt.native sm2/sm2_montjdouble_alt.native + +# Tutorial + +.SECONDEXPANSION: +tutorial/%.native: tutorial/%.ml tutorial/%.o ; ../tools/build-proof.sh "$<" "$(HOLLIGHT)" "$@" +# Additional dependencies on .o files +tutorial/rel_loop.native: tutorial/rel_loop2.o +tutorial/rel_simp.native: tutorial/rel_simp2.o +tutorial/rel_veceq.native: tutorial/rel_veceq2.o +tutorial/rel_equivtac.native: tutorial/rel_equivtac2.o +tutorial/rel_reordertac.native: tutorial/rel_reordertac2.o + + +unopt: $(UNOPT_OBJ) + +build_proofs: $(UNOPT_OBJ) $(PROOF_BINS) +# Conservatively check that there is no redefinition of "check_axioms" +# '-I' excludes binary files (*.native). + ! 
grep -RI "check_axioms" . ../common/ --exclude="Makefile" +build_tutorial: $(TUTORIAL_OBJ) $(TUTORIAL_PROOF_BINS); +run_proofs: build_proofs $(PROOF_LOGS); + +proofs: run_proofs ; ../tools/count-proofs.sh . +tutorial: build_tutorial $(TUTORIAL_PROOF_LOGS); + +# Always run sematest regardless of dependency check +FORCE: ; +# Always use max. # of cores because in Makefile one cannot get the passed number of -j. +# A portable way of getting the number of max. cores: +# https://stackoverflow.com/a/23569003/1488216 +NUM_CORES_FOR_SEMATEST = $(shell getconf _NPROCESSORS_ONLN) +sematest: FORCE $(OBJ) proofs/simulator_iclasses.ml proofs/simulator.native + ../tools/run-sematest.sh arm $(NUM_CORES_FOR_SEMATEST) diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/allowed_asm b/third_party/s2n-bignum/s2n-bignum-imported/arm/allowed_asm new file mode 100644 index 00000000000..e511343b85a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/allowed_asm @@ -0,0 +1,169 @@ +: adc$ +: adcs$ +: add$ +: adds$ +: adr$ +: and$ +: and.16b$ +: ands$ +: asr$ +: b$ +: bcax$ +: bcax.16b$ +: bfi$ +: bic$ +: bic.8h$ +: bit$ +: bit.16b$ +: bl$ +: ccmn$ +: ccmp$ +: cinc$ +: cinv$ +: clz$ +: cmhi$ +: cmhi.8h$ +: cmn$ +: cmp$ +: cnt$ +: cnt.16b$ +: cneg$ +: csel$ +: cset$ +: csetm$ +: dup$ +: dup.2d$ +: eor$ +: eor3$ +: eor3.16b$ +: ext$ +: ext.16b$ +: extr$ +: fcsel$ +: fmov$ +: ld1r$ +: ld1r.2d$ +: ldp$ +: ldr$ +: ldrb$ +: ldur$ +: lsl$ +: lsr$ +: madd$ +: mls$ +: mls.2s$ +: mls.8h$ +: mneg$ +: mov$ +: mov.d$ +: movi$ +: movi.2d$ +: movk$ +: msub$ +: mul$ +: mul.4s$ +: mvn$ +: neg$ +: negs$ +: ngc$ +: ngcs$ +: orr$ +: rax1$ +: rax1.2d$ +: ret$ +: rev64$ +: rev64.4s$ +: ror$ +: sbc$ +: sbcs$ +: sbfx$ +: shl$ +: shl.2d$ +: shrn$ +: shrn.2s$ +: sli$ +: sli.2d$ +: smlal$ +: smlal.2d$ +: smlal2$ +: smlal2.2d$ +: smlsl$ +: smlsl.2d$ +: smlsl2$ +: smlsl2.2d$ +: smulh$ +: smull$ +: smull.2d$ +: smull2$ +: smull2.2d$ +: sqdmulh$ +: sqdmulh.4s$ +: sqdmulh.8h$ +: sqdmulh.s$ +: sqrdmulh$ +: sqrdmulh.2s$ +: sqrdmulh.4s$ +: sqrdmulh.8h$ +: sri$ +: sri.2d$ +: sri.4h$ +: srshr$ +: srshr.2d$ +: srshr.8h$ +: sshr$ +: sshr.8h$ +: stp$ +: str$ +: strb$ +: stur$ +: sub$ +: subs$ +: trn1$ +: trn1.16b$ +: trn1.2d$ +: trn1.2s$ +: trn1.4s$ +: trn2$ +: trn2.2d$ +: trn2.2s$ +: trn2.4s$ +: tst$ +: uaddlp$ +: uaddlp.2d$ +: uaddlv$ +: uaddlv.8h$ +: ubfx$ +: umaddl$ +: umlal$ +: umlal.2d$ +: umlal2$ +: umlal2.2d$ +: umlsl$ +: umlsl.2d$ +: umlsl2$ +: umlsl2.2d$ +: umulh$ +: umull$ +: umull.2d$ +: umull2$ +: umull2.2d$ +: ushr$ +: ushr.2d$ +: ushr.8h$ +: usra$ +: usra.2d$ +: uzp1$ +: uzp1.4s$ +: uzp2$ +: uzp2.4s$ +: xar$ +: xar.2d$ +: xtn$ +: xtn.2s$ +: zip1$ +: zip1.2s$ +: zip1.4s$ +: zip2$ +: zip2.2s$ +: zip2.4s$ +: $ diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/Makefile new file mode 100644 index 00000000000..b22696783d4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/Makefile @@ -0,0 +1,77 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). 
Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_add_p25519.o \ + bignum_cmul_p25519.o \ + bignum_double_p25519.o \ + bignum_inv_p25519.o \ + bignum_invsqrt_p25519.o \ + bignum_invsqrt_p25519_alt.o \ + bignum_madd_n25519.o \ + bignum_madd_n25519_alt.o \ + bignum_mod_m25519_4.o \ + bignum_mod_n25519.o \ + bignum_mod_n25519_4.o \ + bignum_mod_p25519_4.o \ + bignum_mul_p25519.o \ + bignum_mul_p25519_alt.o \ + bignum_neg_p25519.o \ + bignum_optneg_p25519.o \ + bignum_sqr_p25519.o \ + bignum_sqr_p25519_alt.o \ + bignum_sqrt_p25519.o \ + bignum_sqrt_p25519_alt.o \ + bignum_sub_p25519.o \ + curve25519_ladderstep.o \ + curve25519_ladderstep_alt.o \ + curve25519_pxscalarmul.o \ + curve25519_pxscalarmul_alt.o \ + curve25519_x25519.o \ + curve25519_x25519_alt.o \ + curve25519_x25519_byte.o \ + curve25519_x25519_byte_alt.o \ + curve25519_x25519base.o \ + curve25519_x25519base_alt.o \ + curve25519_x25519base_byte.o \ + curve25519_x25519base_byte_alt.o \ + edwards25519_decode.o \ + edwards25519_decode_alt.o \ + edwards25519_encode.o \ + edwards25519_epadd.o \ + edwards25519_epadd_alt.o \ + edwards25519_epdouble.o \ + edwards25519_epdouble_alt.o \ + edwards25519_pdouble.o \ + edwards25519_pdouble_alt.o \ + edwards25519_pepadd.o \ + edwards25519_pepadd_alt.o \ + edwards25519_scalarmulbase.o \ + edwards25519_scalarmulbase_alt.o \ + edwards25519_scalarmuldouble.o \ + edwards25519_scalarmuldouble_alt.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_add_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_add_p25519.S new file mode 100644 index 00000000000..9a538925f3c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_add_p25519.S @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p25519) + + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define c0 x7 +#define c1 x8 +#define c2 x9 +#define c3 x10 + +S2N_BN_SYMBOL(bignum_add_p25519): + +// Add as [d3; d2; d1; d0] = x + y; since we assume x, y < 2^255 - 19 +// this sum fits in 256 bits + + ldp d0, d1, [x] + ldp c0, c1, [y] + adds d0, d0, c0 + adcs d1, d1, c1 + ldp d2, d3, [x, #16] + ldp c0, c1, [y, #16] + adcs d2, d2, c0 + adc d3, d3, c1 + +// Now x + y >= 2^255 - 19 <=> x + y + (2^255 + 19) >= 2^256 +// Form [c3; c2; c1; c0] = (x + y) + (2^255 + 19), with CF for the comparison + + mov c3, #0x8000000000000000 + adds c0, d0, #19 + adcs c1, d1, xzr + adcs c2, d2, xzr + adcs c3, d3, c3 + +// If the comparison holds, select [c3; c2; c1; c0]. There's no need to mask +// it since in this case it is ((x + y) + (2^255 + 19)) - 2^256 because the +// top carry is lost, which is the desired (x + y) - (2^255 - 19). + + csel d0, d0, c0, cc + csel d1, d1, c1, cc + csel d2, d2, c2, cc + csel d3, d3, c3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_cmul_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_cmul_p25519.S new file mode 100644 index 00000000000..883007c6c39 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_cmul_p25519.S @@ -0,0 +1,99 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p25519 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = c, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p25519) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p25519_alt) + .text + .balign 4 + +#define z x0 +#define m x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 + +#define a0 x7 +#define a1 x8 +#define a2 x9 +#define a3 x10 + +// Aliased to the a0,..,a3 when they are no longer needed + +#define l x7 +#define q x8 +#define c x9 +#define d4 x10 +#define h x10 + +S2N_BN_SYMBOL(bignum_cmul_p25519): +S2N_BN_SYMBOL(bignum_cmul_p25519_alt): + +// First do the multiply, straightforwardly to get [d4;d3;d2;d1;d0] + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + mul d0, m, a0 + mul d1, m, a1 + mul d2, m, a2 + mul d3, m, a3 + umulh a0, m, a0 + umulh a1, m, a1 + umulh a2, m, a2 + umulh d4, m, a3 + adds d1, d1, a0 + adcs d2, d2, a1 + adcs d3, d3, a2 + adcs d4, d4, xzr + +// Let 2^255 * h + l = [d4,d3,d2,d1,d0] = c * x, and use q = h + 1 +// as the initial quotient estimate, either right or 1 too big. + + add q, d4, 1 + adds xzr, d3, d3 + orr d3, d3, #0x8000000000000000 + adc q, q, d4 + mov c, #19 + mul l, q, c + umulh h, q, c + adds d0, d0, l + adcs d1, d1, h + adcs d2, d2, xzr + adcs d3, d3, xzr + +// Correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + csel c, c, xzr, cc + subs d0, d0, c + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + and d3, d3, #~0x8000000000000000 + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_double_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_double_p25519.S new file mode 100644 index 00000000000..b2772a56a1f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_double_p25519.S @@ -0,0 +1,71 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p25519 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p25519) + + .text + .balign 4 + +#define z x0 +#define x x1 +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define c0 x6 +#define c1 x7 +#define c2 x8 +#define c3 x9 + +S2N_BN_SYMBOL(bignum_double_p25519): + +// Double by adding as [d3; d2; d1; d0] = 2 * x; since we assume +// x < 2^255 - 19 this result fits in 256 bits + + ldp d0, d1, [x] + adds d0, d0, d0 + adcs d1, d1, d1 + ldp d2, d3, [x, #16] + adcs d2, d2, d2 + adc d3, d3, d3 + +// Now 2 * x >= 2^255 - 19 <=> 2 * x + (2^255 + 19) >= 2^256 +// Form [c3; c2; c1; c0] = (2 * x) + (2^255 + 19), with CF for the comparison + + mov c3, #0x8000000000000000 + adds c0, d0, #19 + adcs c1, d1, xzr + adcs c2, d2, xzr + adcs c3, d3, c3 + +// If the comparison holds, select [c3; c2; c1; c0]. There's no need to mask +// it since in this case it is ((2 * x) + (2^255 + 19)) - 2^256 because the +// top carry is lost, which is the desired (2 * x) - (2^255 - 19). + + csel d0, d0, c0, cc + csel d1, d1, c1, cc + csel d2, d2, c2, cc + csel d3, d3, c3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_inv_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_inv_p25519.S new file mode 100644 index 00000000000..d45273a83b0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_inv_p25519.S @@ -0,0 +1,1255 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_25519 = 2^255 - 19 +// Input x[4]; output z[4] +// +// extern void bignum_inv_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Assuming the 4-digit input x is coprime to p_25519, i.e. is not divisible +// by it, returns z < p_25519 such that x * z == 1 (mod p_25519). Note that +// x does not need to be reduced modulo p_25519, but the output always is. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p25519) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(4*N) +#define u sp, #(8*N) +#define v sp, #(12*N) + +// Total size to reserve on the stack + +#define NSPACE #(16*N) + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg 
x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 
__LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 
__LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + 
asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + 
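The divstep59 macro above batches 59 elementary division steps, keeping only the low bits of f and g live in registers and handing back the accumulated 2x2 transition matrix in m00/m01/m10/m11. For orientation, one elementary step in the usual Bernstein-Yang ("safegcd") formulation can be modelled in C as below; this is an illustrative reference only, since the assembly works with d = 2*delta, makes the sign tests branch-free with csel/cneg, and composes the 59 per-step matrices on the fly.

#include <stdint.h>

// One textbook divstep on (delta, f, g) with f odd.  The model assumes
// |f| and |g| stay well inside the int64_t range; the real routine acts
// on 256-bit f and g via the matrix returned by each 59-step batch.
static void divstep_model(int64_t *delta, int64_t *f, int64_t *g) {
  if (*delta > 0 && (*g & 1)) {
    // Swap case: both f and g are odd, so g - f is even.
    int64_t t = *g - *f;
    *delta = 1 - *delta;
    *f = *g;
    *g = t / 2;
  } else {
    *delta = 1 + *delta;
    if (*g & 1) {
      *g = (*g + *f) / 2;   // add odd f to make g even, then halve
    } else {
      *g = *g / 2;          // g already even, just halve
    }
  }
}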
+S2N_BN_SYMBOL(bignum_inv_p25519): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the input and the prime into the main f and g variables. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #-19 + mov x11, #-1 + stp x10, x11, [f] + mov x12, #0x7FFFFFFFFFFFFFFF + stp x11, x12, [f+2*N] + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #(2*N)] + mov x7, #19 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, cc + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [g] + stp x4, x5, [g+2*N] + +// Also maintain weakly reduced < 2*p_25519 vector [u,v] such that +// [f,g] == x * 2^{590-59*i} * [u,v] (mod p_25519) +// starting with [p_25519,x] == x * 2^{590-59*0} * [0,2^-590] (mod p_25519) + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + + movbig(x10, 0xa0f9, 0x9e23, 0x7502, 0x2099) + movbig(x11, 0xa8c6, 0x8f3f, 0x1d13, 0x2595) + movbig(x12, 0x6c6c, 0x8938, 0x05ac, 0x5242) + movbig(x13, 0x2765, 0x08b2, 0x4177, 0x0615) + + stp x10, x11, [v] + stp x12, x13, [v+2*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b bignum_inv_p25519_midloop + +bignum_inv_p25519_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
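The eor/mul/umulh pattern used for every digit below relies on complementing instead of negating: for a sign mask s (all zeros or all ones) and magnitude m, summing m * (digit XOR s) over the digits of a value X, with the accumulator seeded by (m AND s), yields m*X or -m*X modulo the accumulator width, because -X = ~X + 1. (The top digit additionally needs sign extension, which is what the separate treatment of digits 3 and 4 handles.) A small self-contained C check of that identity on a two-digit example, with arbitrary illustrative values:

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint64_t x0 = 0x0123456789abcdefULL, x1 = 0xfedcba9876543210ULL;
  uint64_t m  = 0x00000007ffffffedULL;          // arbitrary magnitude
  unsigned __int128 X = ((unsigned __int128)x1 << 64) | x0;

  for (int neg = 0; neg <= 1; neg++) {
    uint64_t s = neg ? ~(uint64_t)0 : 0;        // sign mask

    // Seed with (m & s), then add m * (digit ^ s) at each digit position.
    unsigned __int128 acc = m & s;
    acc += (unsigned __int128)m * (x0 ^ s);
    acc += ((unsigned __int128)m * (x1 ^ s)) << 64;

    unsigned __int128 want = (unsigned __int128)m * X;
    if (neg) {
      want = (unsigned __int128)0 - want;       // -m*X mod 2^128
    }
    assert(acc == want);
  }
  return 0;
}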
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + asr x3, x1, #63 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + asr x0, x1, #63 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + asr x5, x1, #63 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + asr x0, x1, #63 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + +// Now the computation of the updated u and v values and their +// modular reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. 
+// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Modular reduction of u + + extr x6, x3, x5, #63 + ldp x0, x1, [u] + add x6, x6, x3, asr #63 + mov x3, #19 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x3, x6, x3 + ldr x6, [u+2*N] + adds x0, x0, x4 + adcs x1, x1, x3 + asr x3, x3, #63 + adcs x6, x6, x3 + adc x5, x5, x3 + stp x0, x1, [u] + stp x6, x5, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Modular reduction of v + + extr x6, x5, x2, #63 + ldp x0, x1, [v] + add x6, x6, x5, asr #63 + mov x5, #19 + mul x4, x6, x5 + add x2, x2, x6, lsl #63 + smulh x5, x6, x5 + ldr x3, [v+2*N] + adds x0, x0, x4 + adcs x1, x1, x5 + asr x5, x5, #63 + adcs x3, x3, x5 + adc x2, x2, x5 + stp x0, x1, [v] + stp x3, x2, [v+16] + +bignum_inv_p25519_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_inv_p25519_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. 
+// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_25519) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Modular reduction of u, this time strictly 2^255-19. + + extr x6, x3, x5, #63 + ldp x0, x1, [u] + tst x3, x3 + cinc x6, x6, pl + mov x3, #19 + mul x4, x6, x3 + add x5, x5, x6, lsl #63 + smulh x6, x6, x3 + ldr x2, [u+2*N] + adds x0, x0, x4 + adcs x1, x1, x6 + asr x6, x6, #63 + adcs x2, x2, x6 + adcs x5, x5, x6 + csel x3, x3, xzr, mi + subs x0, x0, x3 + sbcs x1, x1, xzr + sbcs x2, x2, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + +// Store it back to the final output + + mov x4, res + stp x0, x1, [x4] + stp x2, x5, [x4, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519.S new file mode 100644 index 00000000000..50d774d1f58 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519.S @@ -0,0 +1,600 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Inverse square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_invsqrt_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular inverse square root mod p_25519, +// i.e. a z such that x * z^2 == 1 (mod p_25519), whenever one exists. The +// inverse square root z is chosen so that its LSB is even (note that p_25519-z +// is another possibility). 
The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular inverse square +// root and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 so trivially there is no inverse square root +// +1: x is coprime to p_25519 and z is indeed an inverse square root +// -1: x is coprime to p_25519 but there is no (inverse or direct) square root +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_invsqrt_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_invsqrt_p25519) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a sp, #0 +#define b sp, #(4*N) +#define s sp, #(8*N) +#define t sp, #(12*N) + +// Other temporary variables in register + +#define res x19 + +// Total size to reserve on the stack + +#define NSPACE #(16*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ + bl bignum_invsqrt_p25519_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ + bl bignum_invsqrt_p25519_nsqr_p25519 + +S2N_BN_SYMBOL(bignum_invsqrt_p25519): + +// Save registers and make room for temporaries + + stp x19, x30, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Set up reduced version of the input argument a = x mod p_25519. Then +// get the candidate inverse square root s = a^{252-3} + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #19 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, lo + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [a] + stp x4, x5, [a+16] + + // Power 2^2 - 1 = 3 + + nsqr(t,1,a) + mulp(t,t,a) + + // Power 2^4 - 1 = 15 + + nsqr(s,2,t) + mulp(t,s,t) + + // Power 2^5 - 1 = 31 + + nsqr(s,1,t) + mulp(b,s,a) + + // Power 2^10 - 1 + + nsqr(s,5,b) + mulp(t,s,b) + + // Power 2^20 - 1 + + nsqr(s,10,t) + mulp(t,s,t) + + // Power 2^25 - 1 + + nsqr(s,5,t) + mulp(b,s,b) + + // Power 2^50 - 1 + + nsqr(s,25,b) + mulp(t,s,b) + + // Power 2^100 - 1 + nsqr(s,50,t) + mulp(t,s,t) + + // Power 2^125 - 1 + + nsqr(s,25,t) + mulp(b,s,b) + + // Power 2^250 - 1 + + nsqr(s,125,b) + mulp(b,s,b) + + // Power 2^252 - 3 + + nsqr(s,2,b) + mulp(s,s,a) + +// s = a^{2^252-3} is now one candidate inverse square root. 
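The nsqr/mulp ladder above is a fixed addition chain for the exponent 2^252 - 3 = (p_25519 - 5)/8, costing 251 squarings and 11 multiplications. One way to sanity-check the chain's shape is to replay the same squaring/multiplication pattern in a small prime field and compare against direct exponentiation; the C check below does that over an arbitrary 61-bit prime (the modulus, base and helper names are illustrative and unrelated to this module).

#include <assert.h>
#include <stdint.h>

static const uint64_t Q = 2305843009213693951ULL;   // 2^61 - 1, prime

static uint64_t mulmod(uint64_t a, uint64_t b) {
  return (uint64_t)(((unsigned __int128)a * b) % Q);
}

static uint64_t powmod(uint64_t a, uint64_t e, uint64_t m) {
  uint64_t r = 1;
  a %= m;
  while (e) {
    if (e & 1) r = (uint64_t)(((unsigned __int128)r * a) % m);
    a = (uint64_t)(((unsigned __int128)a * a) % m);
    e >>= 1;
  }
  return r;
}

static uint64_t nsqr(uint64_t v, int n) {    // v^(2^n) mod Q
  while (n-- > 0) v = mulmod(v, v);
  return v;
}

int main(void) {
  uint64_t a = 3, t, s, b;

  t = nsqr(a, 1);   t = mulmod(t, a);   // a^(2^2 - 1)
  s = nsqr(t, 2);   t = mulmod(s, t);   // a^(2^4 - 1)
  s = nsqr(t, 1);   b = mulmod(s, a);   // a^(2^5 - 1)
  s = nsqr(b, 5);   t = mulmod(s, b);   // a^(2^10 - 1)
  s = nsqr(t, 10);  t = mulmod(s, t);   // a^(2^20 - 1)
  s = nsqr(t, 5);   b = mulmod(s, b);   // a^(2^25 - 1)
  s = nsqr(b, 25);  t = mulmod(s, b);   // a^(2^50 - 1)
  s = nsqr(t, 50);  t = mulmod(s, t);   // a^(2^100 - 1)
  s = nsqr(t, 25);  b = mulmod(s, b);   // a^(2^125 - 1)
  s = nsqr(b, 125); b = mulmod(s, b);   // a^(2^250 - 1)
  s = nsqr(b, 2);   s = mulmod(s, a);   // a^(2^252 - 3)

  // Direct exponent, reduced mod Q - 1 (valid since gcd(a, Q) = 1).
  uint64_t e = powmod(2, 252, Q - 1);
  e = (e + (Q - 1) - 3) % (Q - 1);
  assert(s == powmod(a, e, Q));
  return 0;
}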
+// Generate the other one t = s * j_25519 where j_25519 = sqrt(-1) + + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// Now multiplex between them according to whether a * s^2 = 1 + + nsqr(b,1,s) + mulp(b,a,b) + + ldp x10, x11, [b] + eor x10, x10, #1 + ldp x12, x13, [b+16] + orr x10, x10, x11 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + +// For definiteness, choose "positive" (LSB=0) inverse square root + + mov x14, #-19 + subs x14, x14, x10 + mov x16, #-1 + sbcs x15, x16, x11 + mov x17, #0x7FFFFFFFFFFFFFFF + sbcs x16, x16, x12 + sbc x17, x17, x13 + + tst x10, #1 + csel x10, x10, x14, eq + csel x11, x11, x15, eq + csel x12, x12, x16, eq + csel x13, x13, x17, eq + + mov x2, res + stp x10, x11, [x2] + stp x12, x13, [x2, #16] + +// Determine if it is is indeed an inverse square root, also distinguishing +// the degenerate x * z^2 == 0 (mod p_25519) case, which is equivalent to +// x == 0 (mod p_25519). Hence return the Legendre-Jacobi symbol as required. + + add x0, b + mov x1, #1 + bl bignum_invsqrt_p25519_nsqr_p25519 + mulp(b,a,b) + + ldp x10, x11, [b] + eor x14, x10, #1 + ldp x12, x13, [b+16] + orr x10, x10, x11 + orr x14, x14, x11 + orr x12, x12, x13 + orr x10, x10, x12 + orr x14, x14, x12 + + cmp x14, xzr + mov x0, #1 + cneg x0, x0, ne + + cmp x10, xzr + csel x0, x0, xzr, ne + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x19, x30, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_invsqrt_p25519_mul_p25519: + ldp x3, x4, [x1] + ldp x5, x6, [x2] + umull x7, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x8, w16, w17 + umull x16, w3, w16 + adds x7, x7, x15, lsl #32 + lsr x15, x15, #32 + adc x8, x8, x15 + adds x7, x7, x16, lsl #32 + lsr x16, x16, #32 + adc x8, x8, x16 + mul x9, x4, x6 + umulh x10, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x9, x9, x8 + adc x10, x10, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x8, x7, x9 + adcs x9, x9, x10 + adc x10, x10, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x8, x15, x8 + eor x3, x3, x16 + adcs x9, x3, x9 + adc x10, x10, x16 + ldp x3, x4, [x1, #16] + ldp x5, x6, [x2, #16] + umull x11, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x12, w16, w17 + umull x16, w3, w16 + adds x11, x11, x15, lsl #32 + lsr x15, x15, #32 + adc x12, x12, x15 + adds x11, x11, x16, lsl #32 + lsr x16, x16, #32 + adc x12, x12, x16 + mul x13, x4, x6 + umulh x14, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x13, x13, x12 + adc x14, x14, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x12, x11, x13 + adcs x13, x13, x14 + adc x14, x14, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x12, x15, x12 + eor x3, x3, x16 + adcs x13, x3, x13 + adc x14, x14, x16 + ldp x3, x4, [x1, #16] + ldp x15, x16, [x1] + subs x3, x3, x15 + sbcs x4, x4, x16 + csetm x16, lo + ldp x15, x17, [x2] + subs x5, x15, x5 + sbcs x6, x17, x6 + csetm x17, lo + eor x3, x3, 
x16 + subs x3, x3, x16 + eor x4, x4, x16 + sbc x4, x4, x16 + eor x5, x5, x17 + subs x5, x5, x17 + eor x6, x6, x17 + sbc x6, x6, x17 + eor x16, x17, x16 + adds x11, x11, x9 + adcs x12, x12, x10 + adcs x13, x13, xzr + adc x14, x14, xzr + mul x2, x3, x5 + umulh x17, x3, x5 + mul x15, x4, x6 + umulh x1, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x9, lo + adds x15, x15, x17 + adc x1, x1, xzr + subs x6, x5, x6 + cneg x6, x6, lo + cinv x9, x9, lo + mul x5, x4, x6 + umulh x6, x4, x6 + adds x17, x2, x15 + adcs x15, x15, x1 + adc x1, x1, xzr + cmn x9, #1 + eor x5, x5, x9 + adcs x17, x5, x17 + eor x6, x6, x9 + adcs x15, x6, x15 + adc x1, x1, x9 + adds x9, x11, x7 + adcs x10, x12, x8 + adcs x11, x13, x11 + adcs x12, x14, x12 + adcs x13, x13, xzr + adc x14, x14, xzr + cmn x16, #1 + eor x2, x2, x16 + adcs x9, x2, x9 + eor x17, x17, x16 + adcs x10, x17, x10 + eor x15, x15, x16 + adcs x11, x15, x11 + eor x1, x1, x16 + adcs x12, x1, x12 + adcs x13, x13, x16 + adc x14, x14, x16 + mov x3, #38 + umull x4, w11, w3 + add x4, x4, w7, uxtw + lsr x7, x7, #32 + lsr x11, x11, #32 + umaddl x11, w11, w3, x7 + mov x7, x4 + umull x4, w12, w3 + add x4, x4, w8, uxtw + lsr x8, x8, #32 + lsr x12, x12, #32 + umaddl x12, w12, w3, x8 + mov x8, x4 + umull x4, w13, w3 + add x4, x4, w9, uxtw + lsr x9, x9, #32 + lsr x13, x13, #32 + umaddl x13, w13, w3, x9 + mov x9, x4 + umull x4, w14, w3 + add x4, x4, w10, uxtw + lsr x10, x10, #32 + lsr x14, x14, #32 + umaddl x14, w14, w3, x10 + mov x10, x4 + lsr x17, x14, #31 + mov x5, #19 + umaddl x5, w5, w17, x5 + add x7, x7, x5 + adds x7, x7, x11, lsl #32 + extr x3, x12, x11, #32 + adcs x8, x8, x3 + extr x3, x13, x12, #32 + adcs x9, x9, x3 + extr x3, x14, x13, #32 + lsl x5, x17, #63 + eor x10, x10, x5 + adc x10, x10, x3 + mov x3, #19 + tst x10, #0x8000000000000000 + csel x3, x3, xzr, pl + subs x7, x7, x3 + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbc x10, x10, xzr + and x10, x10, #0x7fffffffffffffff + stp x7, x8, [x0] + stp x9, x10, [x0, #16] + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_invsqrt_p25519_nsqr_p25519: + +// Copy input argument into [x13;x12;x11;x10] + + ldp x10, x11, [x2] + ldp x12, x13, [x2, #16] + +// Main squaring loop, accumulating in [x13;x12;x11;x10] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_invsqrt_p25519_loop: + umull x2, w10, w10 + lsr x14, x10, #32 + umull x3, w14, w14 + umull x14, w10, w14 + adds x2, x2, x14, lsl #33 + lsr x14, x14, #31 + adc x3, x3, x14 + umull x4, w11, w11 + lsr x14, x11, #32 + umull x5, w14, w14 + umull x14, w11, w14 + mul x15, x10, x11 + umulh x16, x10, x11 + adds x4, x4, x14, lsl #33 + lsr x14, x14, #31 + adc x5, x5, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x5, x5, xzr + adds x3, x3, x15 + adcs x4, x4, x16 + adc x5, x5, xzr + umull x6, w12, w12 + lsr x14, x12, #32 + umull x7, w14, w14 + umull x14, w12, w14 + adds x6, x6, x14, lsl #33 + lsr x14, x14, #31 + adc x7, x7, x14 + umull x8, w13, w13 + lsr x14, x13, #32 + umull x9, w14, w14 + umull x14, w13, w14 + mul x15, x12, x13 + umulh x16, x12, x13 + adds x8, x8, x14, lsl #33 + lsr x14, x14, #31 + adc x9, x9, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x9, x9, xzr + adds x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, xzr + subs x10, x10, x12 + sbcs x11, x11, x13 + csetm x16, lo + eor x10, x10, x16 + subs x10, x10, x16 + eor x11, x11, x16 + sbc x11, x11, x16 + adds x6, x6, x4 + adcs x7, x7, x5 + adcs x8, 
x8, xzr + adc x9, x9, xzr + umull x12, w10, w10 + lsr x5, x10, #32 + umull x13, w5, w5 + umull x5, w10, w5 + adds x12, x12, x5, lsl #33 + lsr x5, x5, #31 + adc x13, x13, x5 + umull x15, w11, w11 + lsr x5, x11, #32 + umull x14, w5, w5 + umull x5, w11, w5 + mul x4, x10, x11 + umulh x16, x10, x11 + adds x15, x15, x5, lsl #33 + lsr x5, x5, #31 + adc x14, x14, x5 + adds x4, x4, x4 + adcs x16, x16, x16 + adc x14, x14, xzr + adds x13, x13, x4 + adcs x15, x15, x16 + adc x14, x14, xzr + adds x4, x2, x6 + adcs x5, x3, x7 + adcs x6, x6, x8 + adcs x7, x7, x9 + csetm x16, lo + subs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x15 + sbcs x7, x7, x14 + adcs x8, x8, x16 + adc x9, x9, x16 + mov x10, #38 + umull x12, w6, w10 + add x12, x12, w2, uxtw + lsr x2, x2, #32 + lsr x6, x6, #32 + umaddl x6, w6, w10, x2 + mov x2, x12 + umull x12, w7, w10 + add x12, x12, w3, uxtw + lsr x3, x3, #32 + lsr x7, x7, #32 + umaddl x7, w7, w10, x3 + mov x3, x12 + umull x12, w8, w10 + add x12, x12, w4, uxtw + lsr x4, x4, #32 + lsr x8, x8, #32 + umaddl x8, w8, w10, x4 + mov x4, x12 + umull x12, w9, w10 + add x12, x12, w5, uxtw + lsr x5, x5, #32 + lsr x9, x9, #32 + umaddl x9, w9, w10, x5 + mov x5, x12 + lsr x13, x9, #31 + mov x11, #19 + umull x11, w11, w13 + add x2, x2, x11 + adds x10, x2, x6, lsl #32 + extr x12, x7, x6, #32 + adcs x11, x3, x12 + extr x12, x8, x7, #32 + adcs x12, x4, x12 + extr x14, x9, x8, #32 + lsl x15, x13, #63 + eor x5, x5, x15 + adc x13, x5, x14 + +// Loop as applicable + + subs x1, x1, #1 + bne bignum_invsqrt_p25519_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. + + adds x6, x10, #19 + adcs x7, x11, xzr + adcs x8, x12, xzr + adcs x9, x13, xzr + + csel x10, x10, x6, pl + csel x11, x11, x7, pl + csel x12, x12, x8, pl + csel x13, x13, x9, pl + bic x13, x13, #0x8000000000000000 + +// Copy result back into destination and return + + stp x10, x11, [x0] + stp x12, x13, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519_alt.S new file mode 100644 index 00000000000..ad05cdffe18 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_invsqrt_p25519_alt.S @@ -0,0 +1,463 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Inverse square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular inverse square root mod p_25519, +// i.e. a z such that x * z^2 == 1 (mod p_25519), whenever one exists. The +// inverse square root z is chosen so that its LSB is even (note that p_25519-z +// is another possibility). 
The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular inverse square +// root and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 so trivially there is no inverse square root +// +1: x is coprime to p_25519 and z is indeed an inverse square root +// -1: x is coprime to p_25519 but there is no (inverse or direct) square root +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_invsqrt_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_invsqrt_p25519_alt) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a sp, #0 +#define b sp, #(4*N) +#define s sp, #(8*N) +#define t sp, #(12*N) + +// Other temporary variables in register + +#define res x19 + +// Total size to reserve on the stack + +#define NSPACE #(16*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ + bl bignum_invsqrt_p25519_alt_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ + bl bignum_invsqrt_p25519_alt_nsqr_p25519 + +S2N_BN_SYMBOL(bignum_invsqrt_p25519_alt): + +// Save registers and make room for temporaries + + stp x19, x30, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Set up reduced version of the input argument a = x mod p_25519. Then +// get the candidate inverse square root s = a^{252-3} + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #19 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, lo + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [a] + stp x4, x5, [a+16] + + // Power 2^2 - 1 = 3 + + nsqr(t,1,a) + mulp(t,t,a) + + // Power 2^4 - 1 = 15 + + nsqr(s,2,t) + mulp(t,s,t) + + // Power 2^5 - 1 = 31 + + nsqr(s,1,t) + mulp(b,s,a) + + // Power 2^10 - 1 + + nsqr(s,5,b) + mulp(t,s,b) + + // Power 2^20 - 1 + + nsqr(s,10,t) + mulp(t,s,t) + + // Power 2^25 - 1 + + nsqr(s,5,t) + mulp(b,s,b) + + // Power 2^50 - 1 + + nsqr(s,25,b) + mulp(t,s,b) + + // Power 2^100 - 1 + nsqr(s,50,t) + mulp(t,s,t) + + // Power 2^125 - 1 + + nsqr(s,25,t) + mulp(b,s,b) + + // Power 2^250 - 1 + + nsqr(s,125,b) + mulp(b,s,b) + + // Power 2^252 - 3 + + nsqr(s,2,b) + mulp(s,s,a) + +// s = a^{2^252-3} is now one candidate inverse square root. 
+// Generate the other one t = s * j_25519 where j_25519 = sqrt(-1) + + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// Now multiplex between them according to whether a * s^2 = 1 + + nsqr(b,1,s) + mulp(b,a,b) + + ldp x10, x11, [b] + eor x10, x10, #1 + ldp x12, x13, [b+16] + orr x10, x10, x11 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + +// For definiteness, choose "positive" (LSB=0) inverse square root + + mov x14, #-19 + subs x14, x14, x10 + mov x16, #-1 + sbcs x15, x16, x11 + mov x17, #0x7FFFFFFFFFFFFFFF + sbcs x16, x16, x12 + sbc x17, x17, x13 + + tst x10, #1 + csel x10, x10, x14, eq + csel x11, x11, x15, eq + csel x12, x12, x16, eq + csel x13, x13, x17, eq + + mov x2, res + stp x10, x11, [x2] + stp x12, x13, [x2, #16] + +// Determine if it is is indeed an inverse square root, also distinguishing +// the degenerate x * z^2 == 0 (mod p_25519) case, which is equivalent to +// x == 0 (mod p_25519). Hence return the Legendre-Jacobi symbol as required. + + add x0, b + mov x1, #1 + bl bignum_invsqrt_p25519_alt_nsqr_p25519 + mulp(b,a,b) + + ldp x10, x11, [b] + eor x14, x10, #1 + ldp x12, x13, [b+16] + orr x10, x10, x11 + orr x14, x14, x11 + orr x12, x12, x13 + orr x10, x10, x12 + orr x14, x14, x12 + + cmp x14, xzr + mov x0, #1 + cneg x0, x0, ne + + cmp x10, xzr + csel x0, x0, xzr, ne + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x19, x30, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_invsqrt_p25519_alt_mul_p25519: + ldp x3, x4, [x1] + ldp x7, x8, [x2] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x2, #16] + mul x11, x3, x9 + umulh x15, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x16, x3, x10 + adcs x15, x15, x11 + adc x16, x16, xzr + ldp x5, x6, [x1, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x15, x15, x11 + mul x11, x4, x10 + adcs x16, x16, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x15, x15, x11 + umulh x11, x4, x9 + adcs x16, x16, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x15, x15, x11 + mul x11, x5, x9 + adcs x16, x16, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x15, x15, x11 + umulh x11, x5, x8 + adcs x16, x16, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x15, x15, x11 + mul x11, x6, x8 + adcs x16, x16, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x16, x16, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + mov x7, #38 + mul x11, x7, x16 + umulh x9, x7, x16 + adds x12, x12, x11 + mul x11, x7, x3 + umulh x3, x7, x3 + adcs x13, x13, x11 + mul x11, x7, x4 + umulh x4, x7, x4 + adcs x14, x14, x11 + mul x11, x7, x5 + umulh x5, x7, x5 + adcs x15, x15, 
x11 + cset x16, hs + adds x15, x15, x4 + adc x16, x16, x5 + cmn x15, x15 + orr x15, x15, #0x8000000000000000 + adc x8, x16, x16 + mov x7, #19 + madd x11, x7, x8, x7 + adds x12, x12, x11 + adcs x13, x13, x9 + adcs x14, x14, x3 + adcs x15, x15, xzr + csel x7, x7, xzr, lo + subs x12, x12, x7 + sbcs x13, x13, xzr + sbcs x14, x14, xzr + sbc x15, x15, xzr + and x15, x15, #0x7fffffffffffffff + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_invsqrt_p25519_alt_nsqr_p25519: + +// Copy input argument into [x5;x4;x3;x2] (overwriting input pointer x20 + + ldp x6, x3, [x2] + ldp x4, x5, [x2, #16] + mov x2, x6 + +// Main squaring loop, accumulating in [x5;x4;x3;x2] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_invsqrt_p25519_alt_loop: + mul x9, x2, x3 + umulh x10, x2, x3 + mul x11, x2, x5 + umulh x12, x2, x5 + mul x7, x2, x4 + umulh x6, x2, x4 + adds x10, x10, x7 + adcs x11, x11, x6 + mul x7, x3, x4 + umulh x6, x3, x4 + adc x6, x6, xzr + adds x11, x11, x7 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x6 + mul x7, x3, x5 + umulh x6, x3, x5 + adc x6, x6, xzr + adds x12, x12, x7 + adcs x13, x13, x6 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x6, hs + umulh x7, x2, x2 + mul x8, x2, x2 + adds x9, x9, x7 + mul x7, x3, x3 + adcs x10, x10, x7 + umulh x7, x3, x3 + adcs x11, x11, x7 + mul x7, x4, x4 + adcs x12, x12, x7 + umulh x7, x4, x4 + adcs x13, x13, x7 + mul x7, x5, x5 + adcs x14, x14, x7 + umulh x7, x5, x5 + adc x6, x6, x7 + mov x3, #38 + mul x7, x3, x12 + umulh x4, x3, x12 + adds x8, x8, x7 + mul x7, x3, x13 + umulh x13, x3, x13 + adcs x9, x9, x7 + mul x7, x3, x14 + umulh x14, x3, x14 + adcs x10, x10, x7 + mul x7, x3, x6 + umulh x6, x3, x6 + adcs x11, x11, x7 + cset x12, hs + adds x11, x11, x14 + adc x12, x12, x6 + cmn x11, x11 + bic x11, x11, #0x8000000000000000 + adc x2, x12, x12 + mov x3, #0x13 + mul x7, x3, x2 + adds x2, x8, x7 + adcs x3, x9, x4 + adcs x4, x10, x13 + adc x5, x11, xzr + +// Loop as applicable + + subs x1, x1, #1 + bne bignum_invsqrt_p25519_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. 
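For reference, the step described above (x >= p_25519 exactly when x + 19 has bit 255 set, in which case clearing that bit of x + 19 gives x - p_25519) can be modelled in C as follows, assuming little-endian limbs and an illustrative helper name:

#include <stdint.h>

// z = x mod p_25519 for x < 2^256 - 38 = 2 * p_25519.
static void strict_reduce_model(uint64_t z[4], const uint64_t x[4]) {
  uint64_t t[4];
  unsigned __int128 acc = (unsigned __int128)x[0] + 19;
  t[0] = (uint64_t)acc; acc >>= 64;
  for (int i = 1; i < 4; i++) {
    acc += x[i];
    t[i] = (uint64_t)acc;
    acc >>= 64;
  }
  // Bit 255 of x + 19 is the "mi"/"pl" distinction in the assembly.
  uint64_t mask = (uint64_t)0 - (t[3] >> 63);   // all-ones iff x >= p_25519
  for (int i = 0; i < 4; i++) {
    z[i] = (t[i] & mask) | (x[i] & ~mask);
  }
  z[3] &= 0x7fffffffffffffffULL;                // the closing "bic"
}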
+ + adds x6, x2, #19 + adcs x7, x3, xzr + adcs x8, x4, xzr + adcs x9, x5, xzr + + csel x2, x2, x6, pl + csel x3, x3, x7, pl + csel x4, x4, x8, pl + csel x5, x5, x9, pl + bic x5, x5, #0x8000000000000000 + +// Copy result back into destination and return + + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519.S similarity index 73% rename from third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519.S index 0171271872d..e6fbc4e3fd5 100644 --- a/third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519.S @@ -39,9 +39,9 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Single round of modular reduction mod_n25519, mapping @@ -50,27 +50,27 @@ // close to the loop body of the bignum_mod_n25519 function. #define reduce(m4,m3,m2,m1,m0) \ - extr q, m4, m3, #60; \ - and m3, m3, #0x0FFFFFFFFFFFFFFF; \ - sub q, q, m4, lsr #60; \ - and t0, m4, #0xF000000000000000; \ - add m3, m3, t0; \ - mul t0, n0, q; \ - mul t1, n1, q; \ - umulh t2, n0, q; \ - adds t1, t1, t2; \ - umulh t2, n1, q; \ - adc t2, t2, xzr; \ - subs m0, m0, t0; \ - sbcs m1, m1, t1; \ - sbcs m2, m2, t2; \ - sbcs m3, m3, xzr; \ - csel t0, n0, xzr, cc; \ - csel t1, n1, xzr, cc; \ - adds m0, m0, t0; \ - and t2, t0, #0x1000000000000000; \ - adcs m1, m1, t1; \ - adcs m2, m2, xzr; \ + extr q, m4, m3, #60 __LF \ + and m3, m3, #0x0FFFFFFFFFFFFFFF __LF \ + sub q, q, m4, lsr #60 __LF \ + and t0, m4, #0xF000000000000000 __LF \ + add m3, m3, t0 __LF \ + mul t0, n0, q __LF \ + mul t1, n1, q __LF \ + umulh t2, n0, q __LF \ + adds t1, t1, t2 __LF \ + umulh t2, n1, q __LF \ + adc t2, t2, xzr __LF \ + subs m0, m0, t0 __LF \ + sbcs m1, m1, t1 __LF \ + sbcs m2, m2, t2 __LF \ + sbcs m3, m3, xzr __LF \ + csel t0, n0, xzr, cc __LF \ + csel t1, n1, xzr, cc __LF \ + adds m0, m0, t0 __LF \ + and t2, t0, #0x1000000000000000 __LF \ + adcs m1, m1, t1 __LF \ + adcs m2, m2, xzr __LF \ adc m3, m3, t2 // Special case of "reduce" with m4 = 0. As well as not using m4, @@ -78,24 +78,24 @@ // versus min (floor(m/2^252)) (2^63-1). 
#define reduce0(m3,m2,m1,m0) \ - lsr q, m3, #60; \ - and m3, m3, #0x0FFFFFFFFFFFFFFF; \ - mul t0, n0, q; \ - mul t1, n1, q; \ - umulh t2, n0, q; \ - adds t1, t1, t2; \ - umulh t2, n1, q; \ - adc t2, t2, xzr; \ - subs m0, m0, t0; \ - sbcs m1, m1, t1; \ - sbcs m2, m2, t2; \ - sbcs m3, m3, xzr; \ - csel t0, n0, xzr, cc; \ - csel t1, n1, xzr, cc; \ - adds m0, m0, t0; \ - and t2, t0, #0x1000000000000000; \ - adcs m1, m1, t1; \ - adcs m2, m2, xzr; \ + lsr q, m3, #60 __LF \ + and m3, m3, #0x0FFFFFFFFFFFFFFF __LF \ + mul t0, n0, q __LF \ + mul t1, n1, q __LF \ + umulh t2, n0, q __LF \ + adds t1, t1, t2 __LF \ + umulh t2, n1, q __LF \ + adc t2, t2, xzr __LF \ + subs m0, m0, t0 __LF \ + sbcs m1, m1, t1 __LF \ + sbcs m2, m2, t2 __LF \ + sbcs m3, m3, xzr __LF \ + csel t0, n0, xzr, cc __LF \ + csel t1, n1, xzr, cc __LF \ + adds m0, m0, t0 __LF \ + and t2, t0, #0x1000000000000000 __LF \ + adcs m1, m1, t1 __LF \ + adcs m2, m2, xzr __LF \ adc m3, m3, t2 S2N_BN_SYMBOL(bignum_madd_n25519): diff --git a/third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519_alt.S similarity index 66% rename from third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519_alt.S index d1cdfb2c3b8..45d984f4514 100644 --- a/third_party/s2n-bignum/arm/curve25519/bignum_madd_n25519_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_madd_n25519_alt.S @@ -39,9 +39,9 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Single round of modular reduction mod_n25519, mapping @@ -50,27 +50,27 @@ // close to the loop body of the bignum_mod_n25519 function. #define reduce(m4,m3,m2,m1,m0) \ - extr q, m4, m3, #60; \ - and m3, m3, #0x0FFFFFFFFFFFFFFF; \ - sub q, q, m4, lsr #60; \ - and t0, m4, #0xF000000000000000; \ - add m3, m3, t0; \ - mul t0, n0, q; \ - mul t1, n1, q; \ - umulh t2, n0, q; \ - adds t1, t1, t2; \ - umulh t2, n1, q; \ - adc t2, t2, xzr; \ - subs m0, m0, t0; \ - sbcs m1, m1, t1; \ - sbcs m2, m2, t2; \ - sbcs m3, m3, xzr; \ - csel t0, n0, xzr, cc; \ - csel t1, n1, xzr, cc; \ - adds m0, m0, t0; \ - and t2, t0, #0x1000000000000000; \ - adcs m1, m1, t1; \ - adcs m2, m2, xzr; \ + extr q, m4, m3, #60 __LF \ + and m3, m3, #0x0FFFFFFFFFFFFFFF __LF \ + sub q, q, m4, lsr #60 __LF \ + and t0, m4, #0xF000000000000000 __LF \ + add m3, m3, t0 __LF \ + mul t0, n0, q __LF \ + mul t1, n1, q __LF \ + umulh t2, n0, q __LF \ + adds t1, t1, t2 __LF \ + umulh t2, n1, q __LF \ + adc t2, t2, xzr __LF \ + subs m0, m0, t0 __LF \ + sbcs m1, m1, t1 __LF \ + sbcs m2, m2, t2 __LF \ + sbcs m3, m3, xzr __LF \ + csel t0, n0, xzr, cc __LF \ + csel t1, n1, xzr, cc __LF \ + adds m0, m0, t0 __LF \ + and t2, t0, #0x1000000000000000 __LF \ + adcs m1, m1, t1 __LF \ + adcs m2, m2, xzr __LF \ adc m3, m3, t2 // Special case of "reduce" with m4 = 0. As well as not using m4, @@ -78,24 +78,24 @@ // versus min (floor(m/2^252)) (2^63-1). 
#define reduce0(m3,m2,m1,m0) \ - lsr q, m3, #60; \ - and m3, m3, #0x0FFFFFFFFFFFFFFF; \ - mul t0, n0, q; \ - mul t1, n1, q; \ - umulh t2, n0, q; \ - adds t1, t1, t2; \ - umulh t2, n1, q; \ - adc t2, t2, xzr; \ - subs m0, m0, t0; \ - sbcs m1, m1, t1; \ - sbcs m2, m2, t2; \ - sbcs m3, m3, xzr; \ - csel t0, n0, xzr, cc; \ - csel t1, n1, xzr, cc; \ - adds m0, m0, t0; \ - and t2, t0, #0x1000000000000000; \ - adcs m1, m1, t1; \ - adcs m2, m2, xzr; \ + lsr q, m3, #60 __LF \ + and m3, m3, #0x0FFFFFFFFFFFFFFF __LF \ + mul t0, n0, q __LF \ + mul t1, n1, q __LF \ + umulh t2, n0, q __LF \ + adds t1, t1, t2 __LF \ + umulh t2, n1, q __LF \ + adc t2, t2, xzr __LF \ + subs m0, m0, t0 __LF \ + sbcs m1, m1, t1 __LF \ + sbcs m2, m2, t2 __LF \ + sbcs m3, m3, xzr __LF \ + csel t0, n0, xzr, cc __LF \ + csel t1, n1, xzr, cc __LF \ + adds m0, m0, t0 __LF \ + and t2, t0, #0x1000000000000000 __LF \ + adcs m1, m1, t1 __LF \ + adcs m2, m2, xzr __LF \ adc m3, m3, t2 S2N_BN_SYMBOL(bignum_madd_n25519_alt): diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_m25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_m25519_4.S new file mode 100644 index 00000000000..75f5e7ece4e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_m25519_4.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod m_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_m25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of curve25519/edwards25519. +// This is the full group order, 8 * the standard basepoint order. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_m25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_m25519_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_m25519_4): + +// Load the complicated three words of m_25519 (the other being n2 = 0) + + movbig( n0, #0xc093, #0x18d2, #0xe7ae, #0x9f68) + movbig( n1, #0xa6f7, #0xcef5, #0x17bc, #0xe6b2) + mov n3, #0x8000000000000000 + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. 
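Since m_25519 is larger than 2^255, any 4-word input is below 2 * m_25519, so the single conditional subtraction that follows is enough. A short Python model (the decimal basepoint-order value is the standard constant, not taken from this file):

    M = (0x8000000000000000 << 192) | (0xa6f7cef517bce6b2 << 64) | 0xc09318d2e7ae9f68
    L = (1 << 252) + 27742317777372353535851937790883648493   # standard basepoint order
    assert M == 8 * L                                          # full group order, as stated above

    def mod_m25519_4(x):
        assert 0 <= x < (1 << 256)
        d = x - M
        return x if d < 0 else d       # csel keeps the original digits on borrow

    for v in (0, M - 1, M, M + 12345, (1 << 256) - 1):
        assert mod_m25519_4(v) == v % M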
+ + subs n0, d0, n0 + sbcs n1, d1, n1 + sbcs n2, d2, xzr + sbcs n3, d3, n3 + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/curve25519/bignum_mod_n25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519.S similarity index 89% rename from third_party/s2n-bignum/arm/curve25519/bignum_mod_n25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519.S index 2fe18ac7545..591baa50359 100644 --- a/third_party/s2n-bignum/arm/curve25519/bignum_mod_n25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519.S @@ -45,9 +45,9 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(bignum_mod_n25519): @@ -55,7 +55,7 @@ S2N_BN_SYMBOL(bignum_mod_n25519): // If the input is already <= 3 words long, go to a trivial "copy" path cmp k, #4 - bcc short + bcc bignum_mod_n25519_short // Otherwise load the top 4 digits (top-down) and reduce k by 4 // This [m3;m2;m1;m0] is the initial x where we begin reduction. @@ -108,7 +108,7 @@ S2N_BN_SYMBOL(bignum_mod_n25519): // is similar to the sequence above except for the more refined quotient // estimation process. - cbz k, writeback + cbz k, bignum_mod_n25519_writeback bignum_mod_n25519_loop: @@ -158,28 +158,28 @@ bignum_mod_n25519_loop: // Finally write back [m3;m2;m1;m0] and return -writeback: +bignum_mod_n25519_writeback: stp m0, m1, [z] stp m2, m3, [z, #16] ret // Short case: just copy the input with zero-padding -short: +bignum_mod_n25519_short: mov m0, xzr mov m1, xzr mov m2, xzr mov m3, xzr - cbz k, writeback + cbz k, bignum_mod_n25519_writeback ldr m0, [x] subs k, k, #1 - beq writeback + beq bignum_mod_n25519_writeback ldr m1, [x, #8] subs k, k, #1 - beq writeback + beq bignum_mod_n25519_writeback ldr m2, [x, #16] - b writeback + b bignum_mod_n25519_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519_4.S new file mode 100644 index 00000000000..ab86e7df944 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_n25519_4.S @@ -0,0 +1,104 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the order of the curve25519/edwards25519 basepoint. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n25519_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 + +#define d0 x4 +#define d1 x5 +#define d2 x6 +#define d3 x7 + +#define q x8 + +#define m0 x9 +#define m1 x10 +#define m2 x11 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_n25519_4): + +// Load the complicated three words of n_25519. +// The others are n2 = 0 and n3 = 0x1000000000000000, which +// are handled a bit differently + + movbig( n0, #0x5812, #0x631a, #0x5cf5, #0xd3ed) + movbig( n1, #0x14de, #0xf9de, #0xa2f7, #0x9cd6) + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Get the quotient estimate q = floor(x/2^252). +// Also delete it from d3, in effect doing x' = x - q * 2^252 + + lsr q, d3, #60 + and d3, d3, #0x0FFFFFFFFFFFFFFF + +// Multiply [m2;m1;m0] = q * [n1;n0] + + mul m0, n0, q + mul m1, n1, q + umulh m2, n0, q + adds m1, m1, m2 + umulh m2, n1, q + adc m2, m2, xzr + +// Subtract [d3;d2;d1;d0] = x' - q * [n1;n0] = x - q * n_25519 + + subs d0, d0, m0 + sbcs d1, d1, m1 + sbcs d2, d2, m2 + sbcs d3, d3, xzr + +// If this borrows (CF = 0 because of inversion), add back n_25519. +// The masked n3 digit exploits the fact that bit 60 of n0 is set. + + csel n0, n0, xzr, cc + csel n1, n1, xzr, cc + + adds d0, d0, n0 + adcs d1, d1, n1 + and n0, n0, #0x1000000000000000 + adcs d2, d2, xzr + adc d3, d3, n0 + +// Store the end result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_p25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_p25519_4.S new file mode 100644 index 00000000000..4502ef480d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mod_p25519_4.S @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p25519_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define q x6 +#define c x7 + +S2N_BN_SYMBOL(bignum_mod_p25519_4): + +// Load the inputs as [d3;d2;d1;d0] + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Letting x = 2^255 * h + l where h is the top bit, the provisional quotient +// is q = h + 1, which is either correct or 1 too high. 
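The q = h + 1 estimate described above is exact or one too large, so a single conditional correction finishes the reduction; a minimal Python sketch of that reasoning:

    P = (1 << 255) - 19

    def mod_p25519_4(x):
        assert 0 <= x < (1 << 256)
        q = (x >> 255) + 1            # h + 1, with h the top bit of x
        r = x - q * P                 # lies in [-P, 38), so at most one fix-up
        return r + P if r < 0 else r  # add back p_25519 when the estimate was high

    for v in (0, 18, 19, P - 1, P, P + 19, (1 << 256) - 1):
        assert mod_p25519_4(v) == v % P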
+ + mov c, #19 + lsr q, d3, #63 + madd q, c, q, c + +// Writing the provisional remainder as r = x - (2^255 - 19) * q we +// compute r' = (2^255 + l) + 19 * q = r + 2^256 + + adds d0, d0, q + adcs d1, d1, xzr + adcs d2, d2, xzr + orr d3, d3, #0x8000000000000000 + adcs d3, d3, xzr + +// Now r < 0 <=> r' < 2^256 <=> ~CF and in this case we correct by adding +// 2^255 - 19, or in fact subtracting 19 and masking to 255 bits. + + csel q, c, xzr, cc + subs d0, d0, q + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + and d3, d3, #~0x8000000000000000 + +// Store the end result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519.S new file mode 100644 index 00000000000..2855e2ddb90 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519.S @@ -0,0 +1,334 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_25519, z := (x * y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p25519) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a0short w3 +#define a1 x4 +#define b0 x5 +#define b0short w5 +#define b1 x6 + +#define u0 x7 +#define u1 x8 +#define u2 x9 +#define u3 x10 +#define u4 x11 +#define u5 x12 +#define u6 x13 +#define u7 x14 + +#define u0short w7 +#define u1short w8 +#define u2short w9 +#define u3short w10 +#define u4short w11 +#define u5short w12 +#define u6short w13 +#define u7short w14 + +#define t x15 + +#define sgn x16 +#define ysgn x17 + +// These are aliases to registers used elsewhere including input pointers. +// By the time they are used this does not conflict with other uses. 
+ +#define m0 y +#define m1 ysgn +#define m2 t +#define m3 x +#define u u2 + +// For the reduction stages, again aliasing other things but not the u's + +#define c x3 +#define cshort w3 +#define h x4 +#define l x5 +#define lshort w5 +#define d x6 +#define q x17 +#define qshort w17 + +S2N_BN_SYMBOL(bignum_mul_p25519): + +// Multiply the low halves using Karatsuba 2x2->4 to get [u3,u2,u1,u0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + umull u0, a0short, b0short + lsr x17, a0, #32 + umull x15, w17, b0short + lsr x16, b0, #32 + umull u1, w16, w17 + umull x16, a0short, w16 + adds u0, u0, x15, lsl #32 + lsr x15, x15, #32 + adc u1, u1, x15 + adds u0, u0, x16, lsl #32 + lsr x16, x16, #32 + adc u1, u1, x16 + + mul u2, a1, b1 + umulh u3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u2, u2, u1 + adc u3, u3, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u1, u0, u2 + adcs u2, u2, u3 + adc u3, u3, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u1, t, u1 + eor a0, a0, sgn + adcs u2, a0, u2 + adc u3, u3, sgn + +// Multiply the high halves using Karatsuba 2x2->4 to get [u7,u6,u5,u4] + + ldp a0, a1, [x, #16] + ldp b0, b1, [y, #16] + + umull u4, a0short, b0short + lsr x17, a0, #32 + umull x15, w17, b0short + lsr x16, b0, #32 + umull u5, w16, w17 + umull x16, a0short, w16 + adds u4, u4, x15, lsl #32 + lsr x15, x15, #32 + adc u5, u5, x15 + adds u4, u4, x16, lsl #32 + lsr x16, x16, #32 + adc u5, u5, x16 + + mul u6, a1, b1 + umulh u7, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u6, u6, u5 + adc u7, u7, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u5, u4, u6 + adcs u6, u6, u7 + adc u7, u7, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u5, t, u5 + eor a0, a0, sgn + adcs u6, a0, u6 + adc u7, u7, sgn + +// Compute sgn,[a1,a0] = x_hi - x_lo +// and ysgn,[b1,b0] = y_lo - y_hi +// sign-magnitude differences + + ldp a0, a1, [x, #16] + ldp t, sgn, [x] + subs a0, a0, t + sbcs a1, a1, sgn + csetm sgn, cc + + ldp t, ysgn, [y] + subs b0, t, b0 + sbcs b1, ysgn, b1 + csetm ysgn, cc + + eor a0, a0, sgn + subs a0, a0, sgn + eor a1, a1, sgn + sbc a1, a1, sgn + + eor b0, b0, ysgn + subs b0, b0, ysgn + eor b1, b1, ysgn + sbc b1, b1, ysgn + +// Save the correct sign for the sub-product + + eor sgn, ysgn, sgn + +// Add H' = H + L_top, still in [u7,u6,u5,u4] + + adds u4, u4, u2 + adcs u5, u5, u3 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Now compute the mid-product as [m3,m2,m1,m0] + + mul m0, a0, b0 + umulh m1, a0, b0 + mul m2, a1, b1 + umulh m3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm u, cc + + adds m2, m2, m1 + adc m3, m3, xzr + + subs b1, b0, b1 + cneg b1, b1, cc + cinv u, u, cc + + mul b0, a1, b1 + umulh b1, a1, b1 + + adds m1, m0, m2 + adcs m2, m2, m3 + adc m3, m3, xzr + + adds xzr, u, #1 + eor b0, b0, u + adcs m1, b0, m1 + eor b1, b1, u + adcs m2, b1, m2 + adc m3, m3, u + +// Accumulate the positive mid-terms as [u7,u6,u5,u4,u3,u2] + + adds u2, u4, u0 + adcs u3, u5, u1 + adcs u4, u6, u4 + adcs u5, u7, u5 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Add in the sign-adjusted complex term + + adds xzr, sgn, #1 + eor m0, m0, sgn + adcs u2, m0, u2 + eor m1, m1, sgn + adcs u3, m1, u3 + eor m2, m2, sgn + adcs u4, m2, u4 + eor m3, m3, sgn + adcs u5, m3, u5 + adcs u6, u6, sgn + adc u7, u7, sgn + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 38 * h + l (mod p_25519). 
+// We do the 38 * h + l using 32-bit multiplies avoiding umulh, +// and pre-estimate and feed in the next-level quotient +// q = h + 1 where h = an early version of the high 255 bits. +// We add 2^255 * h - 19 * (h + 1), so end up offset by 2^255. + + mov c, #38 + + umull h, u4short, cshort + add h, h, u0short, uxtw + lsr u0, u0, #32 + lsr u4, u4, #32 + umaddl u4, u4short, cshort, u0 + mov u0, h + + umull h, u5short, cshort + add h, h, u1short, uxtw + lsr u1, u1, #32 + lsr u5, u5, #32 + umaddl u5, u5short, cshort, u1 + mov u1, h + + umull h, u6short, cshort + add h, h, u2short, uxtw + lsr u2, u2, #32 + lsr u6, u6, #32 + umaddl u6, u6short, cshort, u2 + mov u2, h + + umull h, u7short, cshort + add h, h, u3short, uxtw + lsr u3, u3, #32 + lsr u7, u7, #32 + umaddl u7, u7short, cshort, u3 + mov u3, h + + lsr q, u7, #31 + + mov l, #19 + umaddl l, lshort, qshort, l + add u0, u0, l + + adds u0, u0, u4, lsl #32 + extr c, u5, u4, #32 + adcs u1, u1, c + extr c, u6, u5, #32 + adcs u2, u2, c + extr c, u7, u6, #32 + lsl l, q, #63 + eor u3, u3, l + adc u3, u3, c + +// Now we correct by a final 2^255-19 if the top bit is clear +// meaning that the "real" pre-reduced result is negative. + + mov c, #19 + tst u3, #0x8000000000000000 + csel c, c, xzr, pl + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + and u3, u3, #~0x8000000000000000 + +// Write back result + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519_alt.S new file mode 100644 index 00000000000..393069b6c79 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_mul_p25519_alt.S @@ -0,0 +1,203 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_25519, z := (x * y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p25519_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p25519_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define l x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 + +#define c b0 +#define q b1 +#define h b2 + +S2N_BN_SYMBOL(bignum_mul_p25519_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul l, a0, b1 + umulh u2, a0, b1 + adds u1, u1, l + + ldp b2, b3, [y, #16] + + mul l, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, l + + mul l, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, l + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul l, a1, b0 + adds u1, u1, l + mul l, a1, b1 + adcs u2, u2, l + mul l, a1, b2 + adcs u3, u3, l + mul l, a1, b3 + adcs u4, u4, l + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh l, a1, b0 + adds u2, u2, l + umulh l, a1, b1 + adcs u3, u3, l + umulh l, a1, b2 + adcs u4, u4, l + adc u5, u5, xzr + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul l, a2, b0 + adds u2, u2, l + mul l, a2, b1 + adcs u3, u3, l + mul l, a2, b2 + adcs u4, u4, l + mul l, a2, b3 + adcs u5, u5, l + umulh u6, a2, b3 + adc u6, u6, xzr + + umulh l, a2, b0 + adds u3, u3, l + umulh l, a2, b1 + adcs u4, u4, l + umulh l, a2, b2 + adcs u5, u5, l + adc u6, u6, xzr + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] + + mul l, a3, b0 + adds u3, u3, l + mul l, a3, b1 + adcs u4, u4, l + mul l, a3, b2 + adcs u5, u5, l + mul l, a3, b3 + adcs u6, u6, l + umulh u7, a3, b3 + adc u7, u7, xzr + + umulh l, a3, b0 + adds u4, u4, l + umulh l, a3, b1 + adcs u5, u5, l + umulh l, a3, b2 + adcs u6, u6, l + adc u7, u7, xzr + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 38 * h + l (mod p_25519) + + mov c, #38 + + mul l, c, u4 + umulh h, c, u4 + adds u0, u0, l + + mul l, c, u5 + umulh u5, c, u5 + adcs u1, u1, l + + mul l, c, u6 + umulh u6, c, u6 + adcs u2, u2, l + + mul l, c, u7 + umulh u7, c, u7 + adcs u3, u3, l + cset u4, cs + +// Compute the top part deferring the [u5,h] addition till the following +// carry chain. This is enough to get a good quotient estimate and saves +// a couple of instructions. + + adds u3, u3, u6 + adc u4, u4, u7 + +// Now we have reduced to 5 digits, 2^255 * H + L = [u4,u3,u2,u1,u0] +// Use q = H + 1 as the initial quotient estimate, either right or 1 too big. 
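The two reduction ideas used above (folding the top 256 bits via 2^256 == 38 mod p_25519, then the q = H + 1 estimate on the remaining five digits) can be checked end to end in Python; this is a sketch of the arithmetic, not of the register-level carry handling:

    P = (1 << 255) - 19
    assert (1 << 256) % P == 38

    def reduce_product(prod):
        assert 0 <= prod < (1 << 512)            # full 8-digit product
        h, l = prod >> 256, prod & ((1 << 256) - 1)
        t = 38 * h + l                           # below 2^262, fits in five 64-bit digits
        r = t - ((t >> 255) + 1) * P             # q = H + 1: exact or one too big
        return r + P if r < 0 else r

    for prod in (0, P - 1, P, 38 * P + 5, (P - 1) ** 2, ((1 << 256) - 1) ** 2):
        assert reduce_product(prod) == prod % P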
+ + adds xzr, u3, u3 + orr u3, u3, #0x8000000000000000 + adc q, u4, u4 + mov c, #19 + madd l, c, q, c + adds u0, u0, l + adcs u1, u1, h + adcs u2, u2, u5 + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + bic u3, u3, #0x8000000000000000 + +// Write back and return + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/curve25519/bignum_neg_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_neg_p25519.S similarity index 100% rename from third_party/s2n-bignum/arm/curve25519/bignum_neg_p25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_neg_p25519.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_optneg_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_optneg_p25519.S new file mode 100644 index 00000000000..026dbe47813 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_optneg_p25519.S @@ -0,0 +1,75 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p25519 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = p, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p25519) + .text + .balign 4 + +#define z x0 +#define p x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define n0 x7 +#define n1 x8 +#define n2 x9 +#define n3 x10 + +#define c x11 + +S2N_BN_SYMBOL(bignum_optneg_p25519): + +// Load the digits of x as [d3;d2;d1;d0] and compute c = the OR of those digits +// as well as its proto-negation [n3;n2;n1;n0] = (2^255 - 19) - x + + ldp d0, d1, [x] + mov n0, #-19 + orr c, d0, d1 + subs n0, n0, d0 + mov n2, #-1 + sbcs n1, n2, d1 + ldp d2, d3, [x, #16] + orr c, c, d2 + sbcs n2, n2, d2 + mov n3, #0x7FFFFFFFFFFFFFFF + orr c, c, d3 + sbc n3, n3, d3 + +// Now we return just x if p = 0 or if x = 0 (to avoid giving -0 = p, which +// is not strictly reduced even though it's correct modulo p). The conditional +// comparison uses immediate 4 which means ZF. 
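The selection implemented by the cmp/ccmp pair above boils down to "negate only when both p and x are nonzero", which keeps the result strictly reduced (0 rather than p_25519 for -0). A small Python model:

    P = (1 << 255) - 19

    def optneg_p25519(p_flag, x):
        assert 0 <= x < P                     # input assumed reduced, as in the header
        return (P - x) if (p_flag != 0 and x != 0) else x

    assert optneg_p25519(1, 0) == 0           # -0 stays 0, not P
    assert optneg_p25519(0, 5) == 5
    assert (optneg_p25519(1, 5) + 5) % P == 0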
+ + cmp p, xzr + ccmp c, xzr, #4, ne + + csel d0, n0, d0, ne + csel d1, n1, d1, ne + csel d2, n2, d2, ne + csel d3, n3, d3, ne + +// Write back result and return + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519.S new file mode 100644 index 00000000000..1bcb3aea6ab --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519.S @@ -0,0 +1,230 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_25519, z := (x^2) mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p25519 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p25519) + .text + .balign 4 + +#define z x0 +#define x x1 + +// Variables + +#define u0 x2 +#define u1 x3 +#define u2 x4 +#define u3 x5 +#define u4 x6 +#define u5 x7 +#define u6 x8 +#define u7 x9 + +#define u0short w2 +#define u1short w3 +#define u2short w4 +#define u3short w5 +#define u4short w6 +#define u5short w7 +#define u6short w8 +#define u7short w9 + +#define c x10 +#define cshort w10 +#define l x11 +#define lshort w11 +#define h x12 +#define hshort w12 +#define q x13 +#define qshort w13 + +#define t1 x14 +#define t1short w14 +#define t2 x15 +#define t2short w15 +#define t3 x16 +#define t3short w16 + +S2N_BN_SYMBOL(bignum_sqr_p25519): + +// First just a near-clone of bignum_sqr_4_8 to get the square, using +// different registers to collect full product without writeback. 
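Assuming the same subtractive-Karatsuba split as bignum_sqr_4_8 (an assumption based on the comment above, not a trace of the register schedule), the squaring rests on the identity below, checked numerically in Python with B = 2^128:

    B = 1 << 128
    for x in (0, 3, (1 << 255) - 19, (1 << 256) - 1):
        H, L = x >> 128, x % B
        assert x * x == (B * B + B) * H * H + (B + 1) * L * L - B * (H - L) ** 2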
+ + ldp c, l, [x] + ldp h, q, [x, #16] + umull u0, cshort, cshort + lsr t1, c, #32 + umull u1, t1short, t1short + umull t1, cshort, t1short + adds u0, u0, t1, lsl #33 + lsr t1, t1, #31 + adc u1, u1, t1 + umull u2, lshort, lshort + lsr t1, l, #32 + umull u3, t1short, t1short + umull t1, lshort, t1short + mul t2, c, l + umulh t3, c, l + adds u2, u2, t1, lsl #33 + lsr t1, t1, #31 + adc u3, u3, t1 + adds t2, t2, t2 + adcs t3, t3, t3 + adc u3, u3, xzr + adds u1, u1, t2 + adcs u2, u2, t3 + adc u3, u3, xzr + umull u4, hshort, hshort + lsr t1, h, #32 + umull u5, t1short, t1short + umull t1, hshort, t1short + adds u4, u4, t1, lsl #33 + lsr t1, t1, #31 + adc u5, u5, t1 + umull u6, qshort, qshort + lsr t1, q, #32 + umull u7, t1short, t1short + umull t1, qshort, t1short + mul t2, h, q + umulh t3, h, q + adds u6, u6, t1, lsl #33 + lsr t1, t1, #31 + adc u7, u7, t1 + adds t2, t2, t2 + adcs t3, t3, t3 + adc u7, u7, xzr + adds u5, u5, t2 + adcs u6, u6, t3 + adc u7, u7, xzr + subs c, c, h + sbcs l, l, q + csetm t3, cc + eor c, c, t3 + subs c, c, t3 + eor l, l, t3 + sbc l, l, t3 + adds u4, u4, u2 + adcs u5, u5, u3 + adcs u6, u6, xzr + adc u7, u7, xzr + umull h, cshort, cshort + lsr u3, c, #32 + umull q, u3short, u3short + umull u3, cshort, u3short + adds h, h, u3, lsl #33 + lsr u3, u3, #31 + adc q, q, u3 + umull t2, lshort, lshort + lsr u3, l, #32 + umull t1, u3short, u3short + umull u3, lshort, u3short + mul u2, c, l + umulh t3, c, l + adds t2, t2, u3, lsl #33 + lsr u3, u3, #31 + adc t1, t1, u3 + adds u2, u2, u2 + adcs t3, t3, t3 + adc t1, t1, xzr + adds q, q, u2 + adcs t2, t2, t3 + adc t1, t1, xzr + adds u2, u0, u4 + adcs u3, u1, u5 + adcs u4, u4, u6 + adcs u5, u5, u7 + csetm t3, cc + subs u2, u2, h + sbcs u3, u3, q + sbcs u4, u4, t2 + sbcs u5, u5, t1 + adcs u6, u6, t3 + adc u7, u7, t3 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 38 * h + l (mod p_25519). +// We do the 38 * h + l using 32-bit multiplies avoiding umulh, +// and pre-estimate and feed in the next-level quotient +// q = h + 1 where h = an early version of the high 255 bits. +// We add 2^255 * h - 19 * (h + 1), so end up offset by 2^255. + + mov c, #38 + + umull h, u4short, cshort + add h, h, u0short, uxtw + lsr u0, u0, #32 + lsr u4, u4, #32 + umaddl u4, u4short, cshort, u0 + mov u0, h + + umull h, u5short, cshort + add h, h, u1short, uxtw + lsr u1, u1, #32 + lsr u5, u5, #32 + umaddl u5, u5short, cshort, u1 + mov u1, h + + umull h, u6short, cshort + add h, h, u2short, uxtw + lsr u2, u2, #32 + lsr u6, u6, #32 + umaddl u6, u6short, cshort, u2 + mov u2, h + + umull h, u7short, cshort + add h, h, u3short, uxtw + lsr u3, u3, #32 + lsr u7, u7, #32 + umaddl u7, u7short, cshort, u3 + mov u3, h + + lsr q, u7, #31 + + mov l, #19 + umaddl l, lshort, qshort, l + add u0, u0, l + + adds u0, u0, u4, lsl #32 + extr c, u5, u4, #32 + adcs u1, u1, c + extr c, u6, u5, #32 + adcs u2, u2, c + extr c, u7, u6, #32 + lsl l, q, #63 + eor u3, u3, l + adc u3, u3, c + +// Now we correct by a final 2^255-19 if the top bit is clear +// meaning that the "real" pre-reduced result is negative. 
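The correction described above relies on the value being carried around with a 2^255 offset, so a clear top bit signals a negative "real" remainder; subtracting 19 and masking to 255 bits then adds p_25519 back. A small Python model of that fix-up, assuming the offset value encodes a remainder in [-p, p):

    P = (1 << 255) - 19

    def final_fixup(w):
        # w = r + 2^255 for a true remainder r in [-P, P)
        assert 0 <= w < (1 << 256)
        c = 19 if (w >> 255) == 0 else 0      # "pl": the real result was negative
        return (w - c) & ((1 << 255) - 1)     # optional subtract of 19, then drop bit 255

    for r in (-P, -1, 0, 1, P - 1):
        assert final_fixup(r + (1 << 255)) == r % P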
+ + mov c, #19 + tst u3, #0x8000000000000000 + csel c, c, xzr, pl + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + and u3, u3, #~0x8000000000000000 + +// Write back result + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519_alt.S new file mode 100644 index 00000000000..4941076a97c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqr_p25519_alt.S @@ -0,0 +1,178 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_25519, z := (x^2) mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p25519_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p25519_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define h x6 +#define l x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 + +// Just aliases + +#define q a0 +#define c a1 +#define t a2 +#define u7 h + +S2N_BN_SYMBOL(bignum_sqr_p25519_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). +// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 38 * h + l (mod p_25519) + + mov c, #38 + + mul l, c, u4 + umulh t, c, u4 + adds u0, u0, l + + mul l, c, u5 + umulh u5, c, u5 + adcs u1, u1, l + + mul l, c, u6 + umulh u6, c, u6 + adcs u2, u2, l + + mul l, c, u7 + umulh u7, c, u7 + adcs u3, u3, l + cset u4, cs + +// Compute the top part deferring the [u5,t] addition till the following +// carry chain. This is enough to get a good quotient estimate and saves +// a couple of instructions. 
+ + adds u3, u3, u6 + adc u4, u4, u7 + +// Now we have reduced to 5 digits, 2^255 * H + L = [u4,u3,u2,u1,u0] +// Use q = H + 1 as the initial quotient estimate, either right or 1 too big. + + adds xzr, u3, u3 + orr u3, u3, #0x8000000000000000 + adc q, u4, u4 + mov c, #19 + madd l, c, q, c + adds u0, u0, l + adcs u1, u1, t + adcs u2, u2, u5 + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + bic u3, u3, #0x8000000000000000 + +// Write back and return + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519.S new file mode 100644 index 00000000000..da80e48a428 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519.S @@ -0,0 +1,610 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_sqrt_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular square root mod p_25519, i.e. +// a z such that z^2 == x (mod p_25519), whenever one exists. The square +// root z is chosen so that its LSB is even (note that p_25519 - z is +// another square root). The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular square root +// and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 and z is the square root 0 +// +1: x is coprime to p_25519 and z is a square root +// -1: x is coprime to p_25519 but not a quadratic residue +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqrt_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqrt_p25519) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a sp, #0 +#define b sp, #(4*N) +#define s sp, #(8*N) +#define t sp, #(12*N) + +// Other temporary variables in register + +#define res x19 + +// Total size to reserve on the stack + +#define NSPACE #(16*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ + bl bignum_sqrt_p25519_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ + bl bignum_sqrt_p25519_nsqr_p25519 + +S2N_BN_SYMBOL(bignum_sqrt_p25519): + +// Save registers and make room for temporaries + + stp x19, x30, [sp, -16]! 
+ sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Set up reduced version of the input argument a = x mod p_25519. Then +// get the candidate square root s = a^{252-2} + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #19 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, lo + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [a] + stp x4, x5, [a+16] + + // Power 2^2 - 1 = 3 + + nsqr(t,1,a) + mulp(t,t,a) + + // Power 2^4 - 1 = 15 + + nsqr(s,2,t) + mulp(t,s,t) + + // Power 2^5 - 1 = 31 + + nsqr(s,1,t) + mulp(b,s,a) + + // Power 2^10 - 1 + + nsqr(s,5,b) + mulp(t,s,b) + + // Power 2^20 - 1 + + nsqr(s,10,t) + mulp(t,s,t) + + // Power 2^25 - 1 + + nsqr(s,5,t) + mulp(b,s,b) + + // Power 2^50 - 1 + + nsqr(s,25,b) + mulp(t,s,b) + + // Power 2^100 - 1 + nsqr(s,50,t) + mulp(t,s,t) + + // Power 2^125 - 1 + + nsqr(s,25,t) + mulp(b,s,b) + + // Power 2^250 - 1 + + nsqr(s,125,b) + mulp(b,s,b) + + // Power 2^251 - 1 + + nsqr(s,1,b) + mulp(t,s,a) + + // Power 2^252 - 2 + + nsqr(s,1,t) + +// s is now one candidate square root. Generate the other one t = s * j_25519 + + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// Now multiplex between them according to whether s^2 = a + + nsqr(b,1,s) + + ldp x10, x11, [a] + ldp x14, x15, [b] + eor x10, x10, x14 + eor x11, x11, x15 + orr x10, x10, x11 + ldp x12, x13, [a+16] + ldp x16, x17, [b+16] + eor x12, x12, x16 + eor x13, x13, x17 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + +// For definiteness, choose "positive" (LSB=0) square root + + mov x14, #-19 + subs x14, x14, x10 + mov x16, #-1 + sbcs x15, x16, x11 + mov x17, #0x7FFFFFFFFFFFFFFF + sbcs x16, x16, x12 + sbc x17, x17, x13 + + tst x10, #1 + csel x10, x10, x14, eq + csel x11, x11, x15, eq + csel x12, x12, x16, eq + csel x13, x13, x17, eq + + mov x2, res + stp x10, x11, [x2] + stp x12, x13, [x2, #16] + +// Determine if it is is indeed a square root and also if a = 0 +// Hence return the Legendre-Jacobi symbol as required. 
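A compact Python reference for the square-root and Legendre-symbol logic that the code below completes; it uses pow(2, (p-1)/4, p) as a square root of -1, playing the role of the j_25519 constant loaded above (either square root of -1 works here), and follows the even-root convention from the header comment:

    P = (1 << 255) - 19
    J = pow(2, (P - 1) // 4, P)            # a square root of -1 mod p_25519
    assert (J * J + 1) % P == 0

    def sqrt_p25519(x):
        a = x % P
        s = pow(a, (P + 3) // 8, P)        # a^(2^252 - 2), the candidate root
        if (s * s - a) % P:
            s = s * J % P                  # switch to the other candidate
        if s & 1:
            s = P - s                      # choose the even (LSB = 0) root
        if a == 0:
            return 0, s
        return (1 if (s * s - a) % P == 0 else -1), s

    assert sqrt_p25519(0) == (0, 0)
    assert sqrt_p25519(4) == (1, 2)
    assert sqrt_p25519(2)[0] == -1         # 2 is a non-residue since p = 5 (mod 8)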
+ + add x0, b + mov x1, #1 + bl bignum_sqrt_p25519_nsqr_p25519 + + ldp x10, x11, [a] + ldp x14, x15, [b] + eor x14, x10, x14 + eor x15, x11, x15 + orr x14, x14, x15 + ldp x12, x13, [a+16] + ldp x16, x17, [b+16] + eor x16, x12, x16 + eor x17, x13, x17 + orr x16, x16, x17 + orr x14, x14, x16 + cmp x14, xzr + mov x0, #1 + cneg x0, x0, ne + + orr x10, x10, x11 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + csel x0, x0, xzr, ne + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x19, x30, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_sqrt_p25519_mul_p25519: + ldp x3, x4, [x1] + ldp x5, x6, [x2] + umull x7, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x8, w16, w17 + umull x16, w3, w16 + adds x7, x7, x15, lsl #32 + lsr x15, x15, #32 + adc x8, x8, x15 + adds x7, x7, x16, lsl #32 + lsr x16, x16, #32 + adc x8, x8, x16 + mul x9, x4, x6 + umulh x10, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x9, x9, x8 + adc x10, x10, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x8, x7, x9 + adcs x9, x9, x10 + adc x10, x10, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x8, x15, x8 + eor x3, x3, x16 + adcs x9, x3, x9 + adc x10, x10, x16 + ldp x3, x4, [x1, #16] + ldp x5, x6, [x2, #16] + umull x11, w3, w5 + lsr x17, x3, #32 + umull x15, w17, w5 + lsr x16, x5, #32 + umull x12, w16, w17 + umull x16, w3, w16 + adds x11, x11, x15, lsl #32 + lsr x15, x15, #32 + adc x12, x12, x15 + adds x11, x11, x16, lsl #32 + lsr x16, x16, #32 + adc x12, x12, x16 + mul x13, x4, x6 + umulh x14, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x16, lo + adds x13, x13, x12 + adc x14, x14, xzr + subs x3, x5, x6 + cneg x3, x3, lo + cinv x16, x16, lo + mul x15, x4, x3 + umulh x3, x4, x3 + adds x12, x11, x13 + adcs x13, x13, x14 + adc x14, x14, xzr + cmn x16, #1 + eor x15, x15, x16 + adcs x12, x15, x12 + eor x3, x3, x16 + adcs x13, x3, x13 + adc x14, x14, x16 + ldp x3, x4, [x1, #16] + ldp x15, x16, [x1] + subs x3, x3, x15 + sbcs x4, x4, x16 + csetm x16, lo + ldp x15, x17, [x2] + subs x5, x15, x5 + sbcs x6, x17, x6 + csetm x17, lo + eor x3, x3, x16 + subs x3, x3, x16 + eor x4, x4, x16 + sbc x4, x4, x16 + eor x5, x5, x17 + subs x5, x5, x17 + eor x6, x6, x17 + sbc x6, x6, x17 + eor x16, x17, x16 + adds x11, x11, x9 + adcs x12, x12, x10 + adcs x13, x13, xzr + adc x14, x14, xzr + mul x2, x3, x5 + umulh x17, x3, x5 + mul x15, x4, x6 + umulh x1, x4, x6 + subs x4, x4, x3 + cneg x4, x4, lo + csetm x9, lo + adds x15, x15, x17 + adc x1, x1, xzr + subs x6, x5, x6 + cneg x6, x6, lo + cinv x9, x9, lo + mul x5, x4, x6 + umulh x6, x4, x6 + adds x17, x2, x15 + adcs x15, x15, x1 + adc x1, x1, xzr + cmn x9, #1 + eor x5, x5, x9 + adcs x17, x5, x17 + eor x6, x6, x9 + adcs x15, x6, x15 + adc x1, x1, x9 + adds x9, x11, x7 + adcs x10, x12, x8 + adcs x11, x13, x11 + adcs x12, x14, x12 + adcs x13, x13, xzr + adc x14, x14, xzr + cmn x16, #1 + eor x2, x2, x16 + adcs x9, x2, x9 + eor x17, x17, x16 + adcs x10, x17, x10 + eor x15, x15, x16 + adcs x11, x15, x11 + eor x1, x1, x16 + adcs x12, x1, x12 + adcs x13, x13, x16 + adc x14, x14, x16 + mov x3, #38 + umull x4, w11, w3 + add x4, x4, w7, uxtw + lsr x7, x7, #32 + lsr x11, x11, #32 + umaddl x11, w11, w3, x7 + mov x7, x4 + umull x4, w12, w3 + add x4, x4, w8, uxtw + lsr x8, x8, #32 + lsr x12, x12, #32 + umaddl x12, w12, w3, x8 + mov x8, x4 + umull x4, w13, w3 + add x4, 
x4, w9, uxtw + lsr x9, x9, #32 + lsr x13, x13, #32 + umaddl x13, w13, w3, x9 + mov x9, x4 + umull x4, w14, w3 + add x4, x4, w10, uxtw + lsr x10, x10, #32 + lsr x14, x14, #32 + umaddl x14, w14, w3, x10 + mov x10, x4 + lsr x17, x14, #31 + mov x5, #19 + umaddl x5, w5, w17, x5 + add x7, x7, x5 + adds x7, x7, x11, lsl #32 + extr x3, x12, x11, #32 + adcs x8, x8, x3 + extr x3, x13, x12, #32 + adcs x9, x9, x3 + extr x3, x14, x13, #32 + lsl x5, x17, #63 + eor x10, x10, x5 + adc x10, x10, x3 + mov x3, #19 + tst x10, #0x8000000000000000 + csel x3, x3, xzr, pl + subs x7, x7, x3 + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbc x10, x10, xzr + and x10, x10, #0x7fffffffffffffff + stp x7, x8, [x0] + stp x9, x10, [x0, #16] + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_sqrt_p25519_nsqr_p25519: + +// Copy input argument into [x13;x12;x11;x10] + + ldp x10, x11, [x2] + ldp x12, x13, [x2, #16] + +// Main squaring loop, accumulating in [x13;x12;x11;x10] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_sqrt_p25519_loop: + umull x2, w10, w10 + lsr x14, x10, #32 + umull x3, w14, w14 + umull x14, w10, w14 + adds x2, x2, x14, lsl #33 + lsr x14, x14, #31 + adc x3, x3, x14 + umull x4, w11, w11 + lsr x14, x11, #32 + umull x5, w14, w14 + umull x14, w11, w14 + mul x15, x10, x11 + umulh x16, x10, x11 + adds x4, x4, x14, lsl #33 + lsr x14, x14, #31 + adc x5, x5, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x5, x5, xzr + adds x3, x3, x15 + adcs x4, x4, x16 + adc x5, x5, xzr + umull x6, w12, w12 + lsr x14, x12, #32 + umull x7, w14, w14 + umull x14, w12, w14 + adds x6, x6, x14, lsl #33 + lsr x14, x14, #31 + adc x7, x7, x14 + umull x8, w13, w13 + lsr x14, x13, #32 + umull x9, w14, w14 + umull x14, w13, w14 + mul x15, x12, x13 + umulh x16, x12, x13 + adds x8, x8, x14, lsl #33 + lsr x14, x14, #31 + adc x9, x9, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x9, x9, xzr + adds x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, xzr + subs x10, x10, x12 + sbcs x11, x11, x13 + csetm x16, lo + eor x10, x10, x16 + subs x10, x10, x16 + eor x11, x11, x16 + sbc x11, x11, x16 + adds x6, x6, x4 + adcs x7, x7, x5 + adcs x8, x8, xzr + adc x9, x9, xzr + umull x12, w10, w10 + lsr x5, x10, #32 + umull x13, w5, w5 + umull x5, w10, w5 + adds x12, x12, x5, lsl #33 + lsr x5, x5, #31 + adc x13, x13, x5 + umull x15, w11, w11 + lsr x5, x11, #32 + umull x14, w5, w5 + umull x5, w11, w5 + mul x4, x10, x11 + umulh x16, x10, x11 + adds x15, x15, x5, lsl #33 + lsr x5, x5, #31 + adc x14, x14, x5 + adds x4, x4, x4 + adcs x16, x16, x16 + adc x14, x14, xzr + adds x13, x13, x4 + adcs x15, x15, x16 + adc x14, x14, xzr + adds x4, x2, x6 + adcs x5, x3, x7 + adcs x6, x6, x8 + adcs x7, x7, x9 + csetm x16, lo + subs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x15 + sbcs x7, x7, x14 + adcs x8, x8, x16 + adc x9, x9, x16 + mov x10, #38 + umull x12, w6, w10 + add x12, x12, w2, uxtw + lsr x2, x2, #32 + lsr x6, x6, #32 + umaddl x6, w6, w10, x2 + mov x2, x12 + umull x12, w7, w10 + add x12, x12, w3, uxtw + lsr x3, x3, #32 + lsr x7, x7, #32 + umaddl x7, w7, w10, x3 + mov x3, x12 + umull x12, w8, w10 + add x12, x12, w4, uxtw + lsr x4, x4, #32 + lsr x8, x8, #32 + umaddl x8, w8, w10, x4 + mov x4, x12 + umull x12, w9, w10 + add x12, x12, w5, uxtw + lsr x5, x5, #32 + lsr x9, x9, #32 + umaddl x9, w9, w10, x5 + mov x5, x12 + lsr x13, x9, #31 + mov x11, #19 + umull x11, w11, w13 + add x2, x2, x11 + adds x10, x2, x6, 
lsl #32 + extr x12, x7, x6, #32 + adcs x11, x3, x12 + extr x12, x8, x7, #32 + adcs x12, x4, x12 + extr x14, x9, x8, #32 + lsl x15, x13, #63 + eor x5, x5, x15 + adc x13, x5, x14 + +// Loop as applicable + + subs x1, x1, #1 + bne bignum_sqrt_p25519_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. + + adds x6, x10, #19 + adcs x7, x11, xzr + adcs x8, x12, xzr + adcs x9, x13, xzr + + csel x10, x10, x6, pl + csel x11, x11, x7, pl + csel x12, x12, x8, pl + csel x13, x13, x9, pl + bic x13, x13, #0x8000000000000000 + +// Copy result back into destination and return + + stp x10, x11, [x0] + stp x12, x13, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519_alt.S new file mode 100644 index 00000000000..ac33ef9a160 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sqrt_p25519_alt.S @@ -0,0 +1,473 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_sqrt_p25519_alt(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular square root mod p_25519, i.e. +// a z such that z^2 == x (mod p_25519), whenever one exists. The square +// root z is chosen so that its LSB is even (note that p_25519 - z is +// another square root). The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular square root +// and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 and z is the square root 0 +// +1: x is coprime to p_25519 and z is a square root +// -1: x is coprime to p_25519 but not a quadratic residue +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqrt_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqrt_p25519_alt) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a sp, #0 +#define b sp, #(4*N) +#define s sp, #(8*N) +#define t sp, #(12*N) + +// Other temporary variables in register + +#define res x19 + +// Total size to reserve on the stack + +#define NSPACE #(16*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// Macros wrapping up calls to the local subroutines + +#define mulp(dest,src1,src2) \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ + bl bignum_sqrt_p25519_alt_mul_p25519 + +#define nsqr(dest,n,src) \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ + bl bignum_sqrt_p25519_alt_nsqr_p25519 + +S2N_BN_SYMBOL(bignum_sqrt_p25519_alt): + +// Save registers and make room for temporaries + + stp x19, x30, [sp, -16]! 
+ sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Set up reduced version of the input argument a = x mod p_25519. Then +// get the candidate square root s = a^{252-2} + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x7, #19 + lsr x6, x5, #63 + madd x6, x7, x6, x7 + adds x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + orr x5, x5, #0x8000000000000000 + adcs x5, x5, xzr + csel x6, x7, xzr, lo + subs x2, x2, x6 + sbcs x3, x3, xzr + sbcs x4, x4, xzr + sbc x5, x5, xzr + and x5, x5, #0x7fffffffffffffff + stp x2, x3, [a] + stp x4, x5, [a+16] + + // Power 2^2 - 1 = 3 + + nsqr(t,1,a) + mulp(t,t,a) + + // Power 2^4 - 1 = 15 + + nsqr(s,2,t) + mulp(t,s,t) + + // Power 2^5 - 1 = 31 + + nsqr(s,1,t) + mulp(b,s,a) + + // Power 2^10 - 1 + + nsqr(s,5,b) + mulp(t,s,b) + + // Power 2^20 - 1 + + nsqr(s,10,t) + mulp(t,s,t) + + // Power 2^25 - 1 + + nsqr(s,5,t) + mulp(b,s,b) + + // Power 2^50 - 1 + + nsqr(s,25,b) + mulp(t,s,b) + + // Power 2^100 - 1 + nsqr(s,50,t) + mulp(t,s,t) + + // Power 2^125 - 1 + + nsqr(s,25,t) + mulp(b,s,b) + + // Power 2^250 - 1 + + nsqr(s,125,b) + mulp(b,s,b) + + // Power 2^251 - 1 + + nsqr(s,1,b) + mulp(t,s,a) + + // Power 2^252 - 2 + + nsqr(s,1,t) + +// s is now one candidate square root. Generate the other one t = s * j_25519 + + movbig(x0, #0xc4ee, #0x1b27, #0x4a0e, #0xa0b0) + movbig(x1, #0x2f43, #0x1806, #0xad2f, #0xe478) + movbig(x2, #0x2b4d, #0x0099, #0x3dfb, #0xd7a7) + movbig(x3, #0x2b83, #0x2480, #0x4fc1, #0xdf0b) + stp x0, x1, [t] + stp x2, x3, [t+16] + mulp(t,s,t) + +// Now multiplex between them according to whether s^2 = a + + nsqr(b,1,s) + + ldp x10, x11, [a] + ldp x14, x15, [b] + eor x10, x10, x14 + eor x11, x11, x15 + orr x10, x10, x11 + ldp x12, x13, [a+16] + ldp x16, x17, [b+16] + eor x12, x12, x16 + eor x13, x13, x17 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + + ldp x10, x11, [s] + ldp x14, x15, [t] + csel x10, x10, x14, eq + csel x11, x11, x15, eq + ldp x12, x13, [s+16] + ldp x16, x17, [t+16] + csel x12, x12, x16, eq + csel x13, x13, x17, eq + +// For definiteness, choose "positive" (LSB=0) square root + + mov x14, #-19 + subs x14, x14, x10 + mov x16, #-1 + sbcs x15, x16, x11 + mov x17, #0x7FFFFFFFFFFFFFFF + sbcs x16, x16, x12 + sbc x17, x17, x13 + + tst x10, #1 + csel x10, x10, x14, eq + csel x11, x11, x15, eq + csel x12, x12, x16, eq + csel x13, x13, x17, eq + + mov x2, res + stp x10, x11, [x2] + stp x12, x13, [x2, #16] + +// Determine if it is is indeed a square root and also if a = 0 +// Hence return the Legendre-Jacobi symbol as required. 
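The nsqr/mulp ladder above can be checked by mirroring each step on exponents (nsqr multiplies the exponent by 2^n, mulp adds the two exponents); doing so confirms the candidate root is a^(2^252 - 2) = a^((p_25519 + 3)/8):

    e_a = 1
    e_t = (e_a << 1) + e_a          # 2^2 - 1
    e_t = (e_t << 2) + e_t          # 2^4 - 1
    e_b = (e_t << 1) + e_a          # 2^5 - 1
    e_t = (e_b << 5) + e_b          # 2^10 - 1
    e_t = (e_t << 10) + e_t         # 2^20 - 1
    e_b = (e_t << 5) + e_b          # 2^25 - 1
    e_t = (e_b << 25) + e_b         # 2^50 - 1
    e_t = (e_t << 50) + e_t         # 2^100 - 1
    e_b = (e_t << 25) + e_b         # 2^125 - 1
    e_b = (e_b << 125) + e_b        # 2^250 - 1
    e_t = (e_b << 1) + e_a          # 2^251 - 1
    e_s = e_t << 1                  # 2^252 - 2
    assert e_s == (1 << 252) - 2 == (((1 << 255) - 19) + 3) // 8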
+ + add x0, b + mov x1, #1 + bl bignum_sqrt_p25519_alt_nsqr_p25519 + + ldp x10, x11, [a] + ldp x14, x15, [b] + eor x14, x10, x14 + eor x15, x11, x15 + orr x14, x14, x15 + ldp x12, x13, [a+16] + ldp x16, x17, [b+16] + eor x16, x12, x16 + eor x17, x13, x17 + orr x16, x16, x17 + orr x14, x14, x16 + cmp x14, xzr + mov x0, #1 + cneg x0, x0, ne + + orr x10, x10, x11 + orr x12, x12, x13 + orr x10, x10, x12 + cmp x10, xzr + csel x0, x0, xzr, ne + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x19, x30, [sp], 16 + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_sqrt_p25519_alt_mul_p25519: + ldp x3, x4, [x1] + ldp x7, x8, [x2] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x2, #16] + mul x11, x3, x9 + umulh x15, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x16, x3, x10 + adcs x15, x15, x11 + adc x16, x16, xzr + ldp x5, x6, [x1, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x15, x15, x11 + mul x11, x4, x10 + adcs x16, x16, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x15, x15, x11 + umulh x11, x4, x9 + adcs x16, x16, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x15, x15, x11 + mul x11, x5, x9 + adcs x16, x16, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x15, x15, x11 + umulh x11, x5, x8 + adcs x16, x16, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x15, x15, x11 + mul x11, x6, x8 + adcs x16, x16, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x16, x16, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + mov x7, #38 + mul x11, x7, x16 + umulh x9, x7, x16 + adds x12, x12, x11 + mul x11, x7, x3 + umulh x3, x7, x3 + adcs x13, x13, x11 + mul x11, x7, x4 + umulh x4, x7, x4 + adcs x14, x14, x11 + mul x11, x7, x5 + umulh x5, x7, x5 + adcs x15, x15, x11 + cset x16, hs + adds x15, x15, x4 + adc x16, x16, x5 + cmn x15, x15 + orr x15, x15, #0x8000000000000000 + adc x8, x16, x16 + mov x7, #19 + madd x11, x7, x8, x7 + adds x12, x12, x11 + adcs x13, x13, x9 + adcs x14, x14, x3 + adcs x15, x15, xzr + csel x7, x7, xzr, lo + subs x12, x12, x7 + sbcs x13, x13, xzr + sbcs x14, x14, xzr + sbc x15, x15, xzr + and x15, x15, #0x7fffffffffffffff + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_sqrt_p25519_alt_nsqr_p25519: + +// Copy input argument into [x5;x4;x3;x2] (overwriting input pointer x20 + + ldp x6, x3, [x2] + ldp x4, x5, [x2, #16] + mov x2, x6 + +// Main squaring loop, accumulating in [x5;x4;x3;x2] consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_sqrt_p25519_alt_loop: + mul x9, x2, x3 + umulh x10, x2, x3 + mul x11, x2, x5 + umulh x12, x2, x5 + mul x7, x2, x4 + umulh x6, x2, x4 + adds x10, x10, x7 + adcs x11, x11, x6 + mul x7, x3, x4 + umulh x6, x3, x4 + adc x6, x6, xzr + adds x11, x11, x7 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x6 + mul x7, x3, x5 + umulh x6, x3, x5 + adc 
x6, x6, xzr + adds x12, x12, x7 + adcs x13, x13, x6 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x6, hs + umulh x7, x2, x2 + mul x8, x2, x2 + adds x9, x9, x7 + mul x7, x3, x3 + adcs x10, x10, x7 + umulh x7, x3, x3 + adcs x11, x11, x7 + mul x7, x4, x4 + adcs x12, x12, x7 + umulh x7, x4, x4 + adcs x13, x13, x7 + mul x7, x5, x5 + adcs x14, x14, x7 + umulh x7, x5, x5 + adc x6, x6, x7 + mov x3, #38 + mul x7, x3, x12 + umulh x4, x3, x12 + adds x8, x8, x7 + mul x7, x3, x13 + umulh x13, x3, x13 + adcs x9, x9, x7 + mul x7, x3, x14 + umulh x14, x3, x14 + adcs x10, x10, x7 + mul x7, x3, x6 + umulh x6, x3, x6 + adcs x11, x11, x7 + cset x12, hs + adds x11, x11, x14 + adc x12, x12, x6 + cmn x11, x11 + bic x11, x11, #0x8000000000000000 + adc x2, x12, x12 + mov x3, #0x13 + mul x7, x3, x2 + adds x2, x8, x7 + adcs x3, x9, x4 + adcs x4, x10, x13 + adc x5, x11, xzr + +// Loop as applicable + + subs x1, x1, #1 + bne bignum_sqrt_p25519_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "pl" condition. + + adds x6, x2, #19 + adcs x7, x3, xzr + adcs x8, x4, xzr + adcs x9, x5, xzr + + csel x2, x2, x6, pl + csel x3, x3, x7, pl + csel x4, x4, x8, pl + csel x5, x5, x9, pl + bic x5, x5, #0x8000000000000000 + +// Copy result back into destination and return + + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sub_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sub_p25519.S new file mode 100644 index 00000000000..001a2b45042 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/bignum_sub_p25519.S @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_25519, z := (x - y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p25519) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define l x4 +#define d0 x5 +#define d1 x6 +#define d2 x7 +#define d3 x8 + +S2N_BN_SYMBOL(bignum_sub_p25519): + +// First just subtract the numbers as [d3; d2; d1; d0] = x - y, +// with the inverted carry flag meaning CF <=> x >= y. + + ldp d0, d1, [x] + ldp l, c, [y] + subs d0, d0, l + sbcs d1, d1, c + ldp d2, d3, [x, #16] + ldp l, c, [y, #16] + sbcs d2, d2, l + sbcs d3, d3, c + +// Now if x < y we want to add back p_25519, which staying within 255 bits +// means subtracting 19, since p_25519 = 2^255 - 19. +// Let c be that constant 19 when x < y, zero otherwise. 
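Review note: the identity this correction relies on (after a borrow, adding back p_25519 within 255 bits is the same as subtracting 19 and masking off bit 255, because the borrow already contributed 2^256) is easy to sanity-check in Python. The sketch below is a word-level model of the function's contract, not of the limb-by-limb code; the random self-test is only illustrative.

import random

p = 2**255 - 19

def bignum_sub_p25519_model(x, y):
    d = (x - y) % 2**256             # raw 4-limb subtraction; borrow <=> x < y
    c = 19 if x < y else 0           # "that constant 19 when x < y, zero otherwise"
    return (d - c) & (2**255 - 1)    # subtract c, then mask to 255 bits

for _ in range(10000):
    x, y = random.randrange(p), random.randrange(p)
    assert bignum_sub_p25519_model(x, y) == (x - y) % p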
+ + mov l, #19 + csel c, l, xzr, cc + +// Correct by adding the optional constant and masking to 255 bits + + subs d0, d0, c + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + and d3, d3, #0x7FFFFFFFFFFFFFFF + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep.S new file mode 100644 index 00000000000..941c83f795e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep.S @@ -0,0 +1,962 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery ladder step on pairs of (X,Z)-projective curve25519 points +// +// extern void curve25519_ladderstep +// (uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b) +// +// If point = (X,1) and pp = (n * (X,1),[n+1] * (X,1)) then the output +// rr = (n' * (X,1),[n'+1] * (X,1)) where n' = 2 * n + b, with input +// b assumed to be 0 or 1; in this setting, each pair (X,Z) is assumed to +// be a projective y-free representation of an affine curve25519 point +// (X/Z,y), with the initial "differential" point having Z = 1 and X its +// affine x coordinate. In other words, the ladderstep operation is a +// combination of doubling, differential addition and optional swapping. +// +// Standard ARM ABI: X0 = rr, X1 = point, X2 = pp, X3 = b +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_ladderstep) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_ladderstep) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define rr x17 +#define point x19 +#define pp x20 +#define b x21 + +// Pointer-offset pairs for inputs and outputs + +#define x point, #0 +#define z point, #NUMSIZE +#define xn pp, #0 +#define zn pp, #NUMSIZE +#define xm pp, #(2*NUMSIZE) +#define zm pp, #(3*NUMSIZE) +#define res0 rr, #0 +#define res1 rr, #NUMSIZE +#define res2 rr, #(2*NUMSIZE) +#define res3 rr, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define sm sp, #(0*NUMSIZE) +#define sn sp, #(1*NUMSIZE) +#define dm sp, #(2*NUMSIZE) +#define dn sp, #(3*NUMSIZE) +#define dmsn sp, #(4*NUMSIZE) +#define dnsm sp, #(5*NUMSIZE) +#define s sp, #(6*NUMSIZE) +#define d sp, #(7*NUMSIZE) +#define p sp, #(8*NUMSIZE) + +// More, but aliases to above + +#define sumx sm +#define sumz sn +#define dubx dm +#define dubz dn +#define e dubz +#define spro dnsm +#define dpro sumz + +// Total size to reserve on the stack + +#define NSPACE (9*NUMSIZE) + +// Macros wrapping up the basic field operations bignum_mul_p25519 +// and bignum_sqr_p25519, only trivially different from pure function +// call to those subroutines. 
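Review note: before the macro bodies, it may help to have the ladder step written out at the field level. The Python sketch below models the contract from the header comment (outputs for n' = 2*n + b) using the same intermediate names as the code (sn, dn, sm, dm, sumx/sumz for the differential addition, s, d, e and dubx/dubz for the doubling with the constant 121666). It models the mathematics only, not the register scheduling, the deferred reductions, or the constant-time multiplexing.

p = 2**255 - 19

def ladderstep(x1, xn, zn, xm, zm, b):
    # (xn,zn) ~ n*P and (xm,zm) ~ (n+1)*P with P = (x1,1); return the pairs
    # for n'*P and (n'+1)*P where n' = 2*n + b and b is 0 or 1.
    if b:  # swap so the half being doubled is always (xn,zn)
        (xn, zn), (xm, zm) = (xm, zm), (xn, zn)
    sn, dn = (xn + zn) % p, (xn - zn) % p
    sm, dm = (xm + zm) % p, (xm - zm) % p
    # Differential addition of n*P and (n+1)*P, with known difference P
    sumx = pow(dm * sn + dn * sm, 2, p)
    sumz = (x1 * pow(dm * sn - dn * sm, 2, p)) % p
    # Doubling: s = (xt+zt)^2, d = (xt-zt)^2, e = s - d = 4*xt*zt
    s, d = pow(sn, 2, p), pow(dn, 2, p)
    e = (s - d) % p
    dubx = (s * d) % p
    dubz = (e * (d + 121666 * e)) % p
    return ((sumx, sumz), (dubx, dubz)) if b else ((dubx, dubz), (sumx, sumz))

A full x-only scalar multiplication is essentially this step applied once per scalar bit, most significant bit first, starting from the pair ((1,0), (x1,1)) representing 0*P and 1*P.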
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, 
x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +#define sqr_p25519(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + 
adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umaddl x11, w11, w13, x11 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + mov x10, #0x13 __LF \ + tst x5, #0x8000000000000000 __LF \ + csel x10, x10, xzr, pl __LF \ + subs x2, x2, x10 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, xzr __LF \ + sbc x5, x5, xzr __LF \ + and x5, x5, #0x7fffffffffffffff __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + 
adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
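Review note: the reduction shared by mul_4/sqr_4 and their fully reducing counterparts rests on 2^256 = 38 (mod p_25519) and 2^255 = 19 (mod p_25519). A rough Python model of the idea follows: two folds give a value below 2*p_25519, and the fully reduced macros add one conditional subtraction of p. It tracks the arithmetic only, not the exact carry chain or the "+1" quotient-estimate refinement mentioned above.

import random

p = 2**255 - 19

def fold_lt_2p(n):
    # Fold the high 256 bits back in via 2^256 = 38 (mod p), then fold the
    # top bit via 2^255 = 19 (mod p); the result is only guaranteed < 2*p.
    n = (n % 2**256) + 38 * (n >> 256)
    n = (n % 2**255) + 19 * (n >> 255)
    return n

def mul_p25519_model(a, b):
    # Fully reducing variant: one conditional final subtraction of p.
    t = fold_lt_2p(a * b)
    return t - p if t >= p else t

for _ in range(10000):
    a, b = random.randrange(2**256), random.randrange(2**256)
    t = fold_lt_2p(a * b)
    assert t < 2 * p and t % p == (a * b) % p
    assert mul_p25519_model(a, b) == (a * b) % p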
+ +#define sqr_4(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umull x11, w11, w13 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + stp x2, x3, [P0] __LF \ + stp 
x4, x5, [P0+16] + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x4, x5, [p2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [p1+16] __LF \ + ldp x6, x7, [p2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [p0] __LF \ + stp x2, x3, [p0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// where c is initially in the X1 register. It is assumed +// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a +// high mul in the final part. 
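Review note: the bound quoted above is what lets the last step of cmadd_4 use a single low 64-bit multiply by 19: the folded quotient (the role played by x8 in the macro, i.e. the value shifted down by 255 bits) stays small enough that 19 times it fits in 64 bits. A small Python sketch of that argument, using the constant 121666 that the ladder step later loads into x1 as the illustrative c:

p = 2**255 - 19

def cmadd_model(c, x, y):
    n = c * x + y
    q, l = n >> 255, n % 2**255      # q plays the role of the folded quotient (x8)
    assert 19 * q < 2**64            # follows from 19*(c*x + y) < 2^60 * 2^256
    return l + 19 * q                # < 2*p_25519, so no umulh is needed

# Worst-case 4-digit inputs with the ladder's constant still satisfy the bound
assert cmadd_model(121666, 2**256 - 1, 2**256 - 1) < 2 * p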
+ +#define cmadd_4(p0,p2,p3) \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ + stp x5, x6, [p0+16] + +// Multiplex: z := if NZ then x else y + +#define mux_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0+16] + +// Paired multiplex: (w,z) := if NZ then (y,x) else (x,y) + +#define muxpair_4(p0,p1,p2,p3) \ + ldp x0, x1, [p2] __LF \ + ldp x2, x3, [p3] __LF \ + csel x4, x0, x2, eq __LF \ + csel x6, x0, x2, ne __LF \ + csel x5, x1, x3, eq __LF \ + csel x7, x1, x3, ne __LF \ + stp x4, x5, [p0] __LF \ + stp x6, x7, [p1] __LF \ + ldp x0, x1, [p2+16] __LF \ + ldp x2, x3, [p3+16] __LF \ + csel x4, x0, x2, eq __LF \ + csel x6, x0, x2, ne __LF \ + csel x5, x1, x3, eq __LF \ + csel x7, x1, x3, ne __LF \ + stp x4, x5, [p0+16] __LF \ + stp x6, x7, [p1+16] + +S2N_BN_SYMBOL(curve25519_ladderstep): + +// Save regs and make room for temporaries + + stp x19, x30, [sp, -16]! + stp x20, x21, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov rr, x0 + mov point, x1 + mov pp, x2 + mov b, x3 + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits + + sub_4(dm, xm, zm) + add_4(sn, xn, zn) + sub_4(dn, xn, zn) + add_4(sm, xm, zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_4(dmsn,dm,sn) + + cmp b, xzr + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_4(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub_twice4(dpro,dmsn,dnsm) + sqr_4(s,s) + add_twice4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: sumx = (dmsn + dnsm)^2 + + sqr_p25519(sumx,spro) + +// DOUBLING: e = 121666 * p + d + + mov x1, 0xdb42 + orr x1, x1, 0x10000 + cmadd_4(e,p,d) + +// DOUBLING: dubx = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(dubx,s,d) + +// ADDING: sumz = x * (dmsn - dnsm)^2 + + mul_p25519(sumz,dpro,x) + +// DOUBLING: dubz = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(dubz,p,e) + +// Multiplex the outputs + + cmp b, xzr + muxpair_4(res0,res2,dubx,sumx) + muxpair_4(res1,res3,dubz,sumz) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x20, x21, [sp], 16 + ldp x19, x30, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep_alt.S new file mode 100644 index 00000000000..9aaaf502cc8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_ladderstep_alt.S @@ -0,0 +1,686 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery ladder step on pairs of (X,Z)-projective curve25519 points +// +// extern void curve25519_ladderstep_alt +// (uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b) +// +// If point = (X,1) and pp = (n * (X,1),[n+1] * (X,1)) then the output +// rr = (n' * (X,1),[n'+1] * (X,1)) where n' = 2 * n + b, with input +// b assumed to be 0 or 1; in this setting, each pair (X,Z) is assumed to +// be a projective y-free representation of an affine curve25519 point +// (X/Z,y), with the initial "differential" point having Z = 1 and X its +// affine x coordinate. In other words, the ladderstep operation is a +// combination of doubling, differential addition and optional swapping. 
+// +// Standard ARM ABI: X0 = rr, X1 = point, X2 = pp, X3 = b +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_ladderstep_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_ladderstep_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define rr x17 +#define point x19 +#define pp x20 +#define b x21 + +// Pointer-offset pairs for inputs and outputs + +#define x point, #0 +#define z point, #NUMSIZE +#define xn pp, #0 +#define zn pp, #NUMSIZE +#define xm pp, #(2*NUMSIZE) +#define zm pp, #(3*NUMSIZE) +#define res0 rr, #0 +#define res1 rr, #NUMSIZE +#define res2 rr, #(2*NUMSIZE) +#define res3 rr, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define sm sp, #(0*NUMSIZE) +#define sn sp, #(1*NUMSIZE) +#define dm sp, #(2*NUMSIZE) +#define dn sp, #(3*NUMSIZE) +#define dmsn sp, #(4*NUMSIZE) +#define dnsm sp, #(5*NUMSIZE) +#define s sp, #(6*NUMSIZE) +#define d sp, #(7*NUMSIZE) +#define p sp, #(8*NUMSIZE) + +// More, but aliases to above + +#define sumx sm +#define sumz sn +#define dubx dm +#define dubz dn +#define e dubz +#define spro dnsm +#define dpro sumz + +// Total size to reserve on the stack + +#define NSPACE (9*NUMSIZE) + +// Macros wrapping up the basic field operations bignum_mul_p25519_alt +// and bignum_sqr_p25519_alt, only trivially different from pure function +// call to those subroutines. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ 
+ mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +#define sqr_p25519(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + orr x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + madd x7, x3, x2, x3 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, xzr __LF \ + csel x3, x3, xzr, cc __LF \ + subs x8, x8, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + and x11, x11, #0x7fffffffffffffff __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
+ +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
+ +#define sqr_4(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x4, x5, [p2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [p1+16] __LF \ + ldp x6, x7, [p2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [p0] __LF \ + stp x2, x3, [p0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
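Review note: the double-modulus trick used here (and by the identical macro in curve25519_ladderstep.S) is compact enough to state directly: a carry out of 2^256 is folded back in as +38, which preserves the value mod p_25519 since 2^256 = 38 (mod p), and the result stays within four digits provided the input sum is below 2^257 - 38 as the comment requires. A minimal Python sketch with an illustrative check:

import random

p = 2**255 - 19

def add_twice4_model(x, y):
    s = x + y                      # assumed, per the comment: s < 2^257 - 38
    carry = s >> 256
    return (s % 2**256) + 38 * carry

for _ in range(10000):
    x = random.randrange(2 * p)            # at least one input reduced mod 2*p
    y = random.randrange(2**256)
    r = add_twice4_model(x, y)
    assert r < 2**256 and r % p == (x + y) % p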
+ +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// where c is initially in the X1 register. It is assumed +// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a +// high mul in the final part. + +#define cmadd_4(p0,p2,p3) \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ + stp x5, x6, [p0+16] + +// Multiplex: z := if NZ then x else y + +#define mux_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0+16] + +// Paired multiplex: (w,z) := if NZ then (y,x) else (x,y) + +#define muxpair_4(p0,p1,p2,p3) \ + ldp x0, x1, [p2] __LF \ + ldp x2, x3, [p3] __LF \ + csel x4, x0, x2, eq __LF \ + csel x6, x0, x2, ne __LF \ + csel x5, x1, x3, eq __LF \ + csel x7, x1, x3, ne __LF \ + stp x4, x5, [p0] __LF \ + stp x6, x7, [p1] __LF \ + ldp x0, x1, [p2+16] __LF \ + ldp x2, x3, [p3+16] __LF \ + csel x4, x0, x2, eq __LF \ + csel x6, x0, x2, ne __LF \ + csel x5, x1, x3, eq __LF \ + csel x7, x1, x3, ne __LF \ + stp x4, x5, [p0+16] __LF \ + stp x6, x7, [p1+16] + +S2N_BN_SYMBOL(curve25519_ladderstep_alt): + +// Save regs and make room for temporaries + + stp x19, x30, [sp, -16]! + stp x20, x21, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov rr, x0 + mov point, x1 + mov pp, x2 + mov b, x3 + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits + + sub_4(dm, xm, zm) + add_4(sn, xn, zn) + sub_4(dn, xn, zn) + add_4(sm, xm, zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_4(dmsn,dm,sn) + + cmp b, xzr + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_4(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub_twice4(dpro,dmsn,dnsm) + sqr_4(s,s) + add_twice4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: sumx = (dmsn + dnsm)^2 + + sqr_p25519(sumx,spro) + +// DOUBLING: e = 121666 * p + d + + mov x1, 0xdb42 + orr x1, x1, 0x10000 + cmadd_4(e,p,d) + +// DOUBLING: dubx = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(dubx,s,d) + +// ADDING: sumz = x * (dmsn - dnsm)^2 + + mul_p25519(sumz,dpro,x) + +// DOUBLING: dubz = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(dubz,p,e) + +// Multiplex the outputs + + cmp b, xzr + muxpair_4(res0,res2,dubx,sumx) + muxpair_4(res1,res3,dubz,sumz) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x20, x21, [sp], 16 + ldp x19, x30, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul.S new file mode 100644 index 00000000000..b28051467be --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul.S @@ -0,0 +1,995 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective scalar multiplication, x coordinate only, for curve25519 +// Inputs scalar[4], point[4]; output res[8] +// +// extern void curve25519_pxscalarmul +// (uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]) +// +// Given the X coordinate of an input point = (X,Y) on curve25519, which +// could also be part of a projective representation (X,Y,1) of the same +// point, returns a projective representation (X,Z) = scalar * point, where +// scalar is a 256-bit number. The corresponding affine form is (X/Z,Y'), +// X/Z meaning division modulo 2^255-19, and Y' not being computed by +// this function (nor is any Y coordinate of the input point used). 
+// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_pxscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_pxscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence +// and additional registers for loop counter and swap flag + +#define res x17 +#define point x19 +#define scalar x20 +#define i x21 +#define swap x22 + +// Pointers to input x coord (we don't use y or z) and output coords. + +#define x point, #0 +#define resx res, #0 +#define resz res, #NUMSIZE + +// Pointer-offset pairs for temporaries on stack with some aliasing. + +#define zm sp, #(0*NUMSIZE) +#define sm sp, #(0*NUMSIZE) +#define dpro sp, #(0*NUMSIZE) + +#define sn sp, #(1*NUMSIZE) + +#define dm sp, #(2*NUMSIZE) + +#define zn sp, #(3*NUMSIZE) +#define dn sp, #(3*NUMSIZE) +#define e sp, #(3*NUMSIZE) + +#define dmsn sp, #(4*NUMSIZE) +#define p sp, #(4*NUMSIZE) + +#define xm sp, #(5*NUMSIZE) +#define dnsm sp, #(5*NUMSIZE) +#define spro sp, #(5*NUMSIZE) + +#define xn sp, #(6*NUMSIZE) +#define s sp, #(6*NUMSIZE) + +#define d sp, #(7*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (8*NUMSIZE) + +// Macros wrapping up the basic field operations bignum_mul_p25519 +// and bignum_sqr_p25519, only trivially different from pure function +// call to those subroutines. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + 
csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +#define sqr_p25519(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 
__LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umaddl x11, w11, w13, x11 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + mov x10, #0x13 __LF \ + tst x5, #0x8000000000000000 __LF \ + csel x10, x10, xzr, pl __LF \ + subs x2, x2, x10 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, xzr __LF \ + sbc x5, x5, xzr __LF \ + and x5, x5, #0x7fffffffffffffff __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
+ +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, 
w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 
__LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umull x11, w11, w13 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x4, x5, [p2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [p1+16] __LF \ + ldp x6, x7, [p2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [p0] __LF \ + stp x2, x3, [p0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
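As a companion to the comment above, the following C sketch (illustrative only; it assumes a GCC/Clang unsigned __int128 type) shows the operation the add_twice4 macro below performs: a plain 4-limb addition whose carry out of 2^256 is folded back in as 38, which is sound only under the stated precondition on the inputs.

    #include <stdint.h>

    /* Add two 4-limb values and fold the carry out of 2^256 back in as 38,
       since 2^256 == 38 (mod 2 * p_25519 = 2^256 - 38).  As stated above,
       this is only sound when the true sum is below 2^257 - 38, e.g. when at
       least one input is already reduced w.r.t. the double modulus. */
    static void add_twice4_sketch(uint64_t z[4], const uint64_t x[4], const uint64_t y[4])
    {
      unsigned __int128 acc = 0;        /* assumes a GCC/Clang 128-bit type */
      uint64_t t[4];
      for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)x[i] + y[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
      }
      acc = acc ? 38 : 0;               /* carry out of 2^256 is worth 38 */
      for (int i = 0; i < 4; i++) {
        acc += t[i];
        z[i] = (uint64_t)acc;
        acc >>= 64;
      }
      /* Under the precondition above, no carry can remain at this point. */
    }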
+ +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// where c is initially in the X1 register. It is assumed +// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a +// high mul in the final part. + +#define cmadd_4(p0,p2,p3) \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ + stp x5, x6, [p0+16] + +// Multiplex: z := if NZ then x else y + +#define mux_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0+16] + +S2N_BN_SYMBOL(curve25519_pxscalarmul): + +// Save regs and make room for temporaries + + stp x19, x22, [sp, -16]! + stp x20, x21, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov res, x0 + mov scalar, x1 + mov point, x2 + +// Initialize (xn,zn) = (1,0) and (xm,zm) = (x,1) with swap = 0 + + mov x2, #1 + stp x2, xzr, [xn] + stp xzr, xzr, [xn+16] + stp xzr, xzr, [zn] + stp xzr, xzr, [zn+16] + ldp x0, x1, [x] + stp x0, x1, [xm] + ldp x0, x1, [x+16] + stp x0, x1, [xm+16] + ldp x0, x1, [x+32] + stp x2, xzr, [zm] + stp xzr, xzr, [zm+16] + mov swap, xzr + +// The outer loop from i = 255, ..., i = 0 (inclusive) + + mov i, #255 + +curve25519_pxscalarmul_loop: + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits + + sub_4(dm, xm, zm) + add_4(sn, xn, zn) + sub_4(dn, xn, zn) + add_4(sm, xm, zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_4(dmsn,sn,dm) + + lsr x0, i, #6 + ldr x2, [scalar, x0, lsl #3] + lsr x2, x2, i + and x2, x2, #1 + + cmp swap, x2 + mov swap, x2 + + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_4(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub_twice4(dpro,dmsn,dnsm) + sqr_4(s,s) + add_twice4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: xm' = (dmsn + dnsm)^2 + + sqr_p25519(xm,spro) + +// DOUBLING: e = 121666 * p + d + + mov x1, 0xdb42 + orr x1, x1, 0x10000 + cmadd_4(e,p,d) + +// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(xn,s,d) + +// ADDING: zm' = x * (dmsn - dnsm)^2 + + mul_p25519(zm,dpro,x) + +// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(zn,p,e) + +// Loop down as far as 0 (inclusive) + + subs i, i, #1 + bcs curve25519_pxscalarmul_loop + +// The main loop does not handle the special input of the 2-torsion +// point = (0,0). In that case we may get a spurious (0,0) as output +// when we want (0,1) [for odd scalar] or (1,0) [for even scalar]. +// Test if x = 0 (this is equivalent for curve25519 to y = 0) and if +// so, patch zm = 1 [for odd multiple], xn = 1 [for even multiple]. + + ldp x0, x1, [point] + orr x0, x0, x1 + ldp x2, x3, [point, #16] + orr x2, x2, x3 + orr x0, x0, x2 + cmp x0, xzr + cset x0, eq + ldr x1, [zm] + orr x1, x1, x0 + str x1, [zm] + ldr x2, [xn] + orr x2, x2, x0 + str x2, [xn] + +// Multiplex into the final outputs + + cmp swap, xzr + + mux_4(resx,xm,xn) + mux_4(resz,zm,zn) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x20, x21, [sp], 16 + ldp x19, x22, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul_alt.S new file mode 100644 index 00000000000..ef62e32cf1e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_pxscalarmul_alt.S @@ -0,0 +1,719 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective scalar multiplication, x coordinate only, for curve25519 +// Inputs scalar[4], point[4]; output res[8] +// +// extern void curve25519_pxscalarmul_alt +// (uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]) +// +// Given the X coordinate of an input point = (X,Y) on curve25519, which +// could also be part of a projective representation (X,Y,1) of the same +// point, returns a projective representation (X,Z) = scalar * point, where +// scalar is a 256-bit number. The corresponding affine form is (X/Z,Y'), +// X/Z meaning division modulo 2^255-19, and Y' not being computed by +// this function (nor is any Y coordinate of the input point used). +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_pxscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_pxscalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence +// and additional registers for loop counter and swap flag + +#define res x17 +#define point x19 +#define scalar x20 +#define i x21 +#define swap x22 + +// Pointers to input x coord (we don't use y or z) and output coords. + +#define x point, #0 +#define resx res, #0 +#define resz res, #NUMSIZE + +// Pointer-offset pairs for temporaries on stack with some aliasing. + +#define zm sp, #(0*NUMSIZE) +#define sm sp, #(0*NUMSIZE) +#define dpro sp, #(0*NUMSIZE) + +#define sn sp, #(1*NUMSIZE) + +#define dm sp, #(2*NUMSIZE) + +#define zn sp, #(3*NUMSIZE) +#define dn sp, #(3*NUMSIZE) +#define e sp, #(3*NUMSIZE) + +#define dmsn sp, #(4*NUMSIZE) +#define p sp, #(4*NUMSIZE) + +#define xm sp, #(5*NUMSIZE) +#define dnsm sp, #(5*NUMSIZE) +#define spro sp, #(5*NUMSIZE) + +#define xn sp, #(6*NUMSIZE) +#define s sp, #(6*NUMSIZE) + +#define d sp, #(7*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (8*NUMSIZE) + +// Macros wrapping up the basic field operations bignum_mul_p25519_alt +// and bignum_sqr_p25519_alt, only trivially different from pure function +// call to those subroutines. 
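Given the calling contract documented in the header above, a hypothetical caller could look like the C sketch below. The scalar value is a placeholder, the prototype is the one quoted in the header comment, and the conversion to affine coordinates is deliberately left abstract.

    #include <stdint.h>

    extern void curve25519_pxscalarmul_alt(uint64_t res[static 8],
                                           uint64_t scalar[static 4],
                                           uint64_t point[static 4]);

    void pxscalarmul_alt_example(void)
    {
      /* Little-endian 4x64-bit operands; the scalar below is a placeholder
         and the point is the x coordinate 9 of the curve25519 base point. */
      uint64_t scalar[4] = {0x0123456789abcdefULL, 0, 0, 0};
      uint64_t point[4]  = {9, 0, 0, 0};
      uint64_t res[8];

      curve25519_pxscalarmul_alt(res, scalar, point);

      /* res[0..3] is the projective X and res[4..7] the projective Z of
         scalar * point; the affine x coordinate would be X * Z^(-1) modulo
         2^255 - 19, a field inversion this sketch deliberately leaves out. */
    }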
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +#define sqr_p25519(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, 
x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + orr x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + madd x7, x3, x2, x3 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, xzr __LF \ + csel x3, x3, xzr, cc __LF \ + subs x8, x8, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + and x11, x11, #0x7fffffffffffffff __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + 
adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x4, x5, [p2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [p1+16] __LF \ + ldp x6, x7, [p2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [p0] __LF \ + stp x2, x3, [p0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(p0,p1,p2) \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ + stp x7, x8, [p0+16] + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// where c is initially in the X1 register. It is assumed +// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a +// high mul in the final part. + +#define cmadd_4(p0,p2,p3) \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ + stp x5, x6, [p0+16] + +// Multiplex: z := if NZ then x else y + +#define mux_4(p0,p1,p2) \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0+16] + +S2N_BN_SYMBOL(curve25519_pxscalarmul_alt): + +// Save regs and make room for temporaries + + stp x19, x22, [sp, -16]! + stp x20, x21, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov res, x0 + mov scalar, x1 + mov point, x2 + +// Initialize (xn,zn) = (1,0) and (xm,zm) = (x,1) with swap = 0 + + mov x2, #1 + stp x2, xzr, [xn] + stp xzr, xzr, [xn+16] + stp xzr, xzr, [zn] + stp xzr, xzr, [zn+16] + ldp x0, x1, [x] + stp x0, x1, [xm] + ldp x0, x1, [x+16] + stp x0, x1, [xm+16] + ldp x0, x1, [x+32] + stp x2, xzr, [zm] + stp xzr, xzr, [zm+16] + mov swap, xzr + +// The outer loop from i = 255, ..., i = 0 (inclusive) + + mov i, #255 + +curve25519_pxscalarmul_alt_loop: + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits + + sub_4(dm, xm, zm) + add_4(sn, xn, zn) + sub_4(dn, xn, zn) + add_4(sm, xm, zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_4(dmsn,sn,dm) + + lsr x0, i, #6 + ldr x2, [scalar, x0, lsl #3] + lsr x2, x2, i + and x2, x2, #1 + + cmp swap, x2 + mov swap, x2 + + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_4(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub_twice4(dpro,dmsn,dnsm) + sqr_4(s,s) + add_twice4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: xm' = (dmsn + dnsm)^2 + + sqr_p25519(xm,spro) + +// DOUBLING: e = 121666 * p + d + + mov x1, 0xdb42 + orr x1, x1, 0x10000 + cmadd_4(e,p,d) + +// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(xn,s,d) + +// ADDING: zm' = x * (dmsn - dnsm)^2 + + mul_p25519(zm,dpro,x) + +// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(zn,p,e) + +// Loop down as far as 0 (inclusive) + + subs i, i, #1 + bcs curve25519_pxscalarmul_alt_loop + +// The main loop does not handle the special input of the 2-torsion +// point = (0,0). In that case we may get a spurious (0,0) as output +// when we want (0,1) [for odd scalar] or (1,0) [for even scalar]. +// Test if x = 0 (this is equivalent for curve25519 to y = 0) and if +// so, patch zm = 1 [for odd multiple], xn = 1 [for even multiple]. 
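Before the corresponding assembly, the patch-up just described can be summarized by this C sketch (illustrative names; the assembly performs the test with cmp/cset and orr, whereas the branch-freedom of the C version ultimately depends on the compiler).

    #include <stdint.h>

    /* If the input x coordinate is zero (the 2-torsion point), force the low
       limb of zm and of xn to 1 so the candidate outputs become (0,1) and
       (1,0) respectively; otherwise leave everything unchanged. */
    static void patch_two_torsion(uint64_t zm[4], uint64_t xn[4], const uint64_t x[4])
    {
      uint64_t any     = x[0] | x[1] | x[2] | x[3];
      uint64_t is_zero = (uint64_t)(any == 0);   /* 1 iff x == 0 */
      zm[0] |= is_zero;   /* picked when the scalar is odd  */
      xn[0] |= is_zero;   /* picked when the scalar is even */
    }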
+ + ldp x0, x1, [point] + orr x0, x0, x1 + ldp x2, x3, [point, #16] + orr x2, x2, x3 + orr x0, x0, x2 + cmp x0, xzr + cset x0, eq + ldr x1, [zm] + orr x1, x1, x0 + str x1, [zm] + ldr x2, [xn] + orr x2, x2, x0 + str x2, [xn] + +// Multiplex into the final outputs + + cmp swap, xzr + + mux_4(resx,xm,xn) + mux_4(resz,zm,zn) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x20, x21, [sp], 16 + ldp x19, x22, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519.S similarity index 100% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519.S diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_alt.S similarity index 69% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_alt.S index 518cb895555..99c2bcced39 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_alt.S @@ -79,204 +79,204 @@ // trivially different from a pure function call to that subroutine. #define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs 
x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. 
#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs 
x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Squaring just giving a result < 2 * p_25519, which is done by @@ -284,77 +284,77 @@ // optional correction. #define sqr_4(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x11, x2, x5; \ - umulh x12, x2, x5; \ - mul x7, x2, x4; \ - umulh x6, x2, x4; \ - adds x10, x10, x7; \ - adcs x11, x11, x6; \ - mul x7, x3, x4; \ - umulh x6, x3, x4; \ - adc x6, x6, xzr; \ - adds x11, x11, x7; \ - mul x13, x4, x5; \ - umulh x14, x4, x5; \ - adcs x12, x12, x6; \ - mul x7, x3, x5; \ - umulh x6, x3, x5; \ - adc x6, x6, xzr; \ - adds x12, x12, x7; \ - adcs x13, x13, x6; \ - adc x14, x14, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - cset x6, cs; \ - umulh x7, x2, x2; \ - mul x8, x2, x2; \ - adds x9, x9, x7; \ - mul x7, x3, x3; \ - adcs x10, x10, x7; \ - umulh x7, x3, x3; \ - adcs x11, x11, x7; \ - mul x7, x4, x4; \ - adcs x12, x12, x7; \ - umulh x7, x4, x4; \ - adcs x13, x13, x7; \ - mul x7, x5, x5; \ - adcs x14, x14, x7; \ - umulh x7, x5, x5; \ - adc x6, x6, x7; \ - mov x3, #0x26; \ - mul x7, x3, x12; \ - umulh x4, x3, x12; \ - adds x8, x8, x7; \ - mul x7, x3, x13; \ - umulh x13, x3, x13; \ - adcs x9, x9, x7; \ - mul x7, x3, x14; \ - umulh x14, x3, x14; \ - adcs x10, x10, x7; \ - mul x7, x3, x6; \ - umulh x6, x3, x6; \ - adcs x11, x11, x7; \ - cset x12, cs; \ - adds x11, x11, x14; \ - adc x12, x12, x6; \ - cmn x11, x11; \ - bic x11, x11, #0x8000000000000000; \ - adc x2, x12, x12; \ - mov x3, #0x13; \ - mul x7, x3, x2; \ - adds x8, x8, x7; \ - adcs x9, x9, x4; \ - adcs x10, x10, x13; \ - adc x11, x11, xzr; \ - stp x8, x9, [P0]; \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs 
__LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ stp x10, x11, [P0+16] // Modular addition with double modulus 2 * p_25519 = 2^256 - 38. @@ -364,41 +364,41 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(p0,p1,p2) \ - ldp x5, x6, [p1]; \ - ldp x4, x3, [p2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [p1+16]; \ - ldp x4, x3, [p2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [p0]; \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ stp x7, x8, [p0+16] // Combined z = c * x + y with reduction only < 2 * p_25519 @@ -407,51 +407,51 @@ // high mul in the final part. 
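For the cmadd_4 macro that follows, the size assumption in the comment above (19 * (c * x + y) < 2^60 * 2^256) is what makes a low-only multiply by 19 sufficient. A C sketch of the same computation, assuming a GCC/Clang unsigned __int128 type and illustrative names:

    #include <stdint.h>

    /* z = c * x + y for a small constant c, folded back below 2 * p_25519
       using 2^255 == 19 (mod p_25519).  The low-only 19 * top multiply is
       enough because of the size bound quoted above. */
    static void cmadd_4_sketch(uint64_t z[4], uint64_t c,
                               const uint64_t x[4], const uint64_t y[4])
    {
      unsigned __int128 acc = 0;        /* assumes a GCC/Clang 128-bit type */
      uint64_t t[4];
      for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)c * x[i] + y[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
      }
      uint64_t hi  = (uint64_t)acc;                /* bits 256 and up of c*x + y */
      uint64_t top = (hi << 1) | (t[3] >> 63);     /* floor((c*x + y) / 2^255)   */
      t[3] &= 0x7fffffffffffffffULL;
      acc = (unsigned __int128)19 * top;           /* 2^255 == 19 (mod p_25519)  */
      for (int i = 0; i < 4; i++) {
        acc += t[i];
        z[i] = (uint64_t)acc;
        acc >>= 64;
      }
    }

In the Montgomery ladder above, this pattern is invoked with c = 121666 to form e = d + 121666 * p.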
#define cmadd_4(p0,p2,p3) \ - ldp x7, x8, [p2]; \ - ldp x9, x10, [p2+16]; \ - mul x3, x1, x7; \ - mul x4, x1, x8; \ - mul x5, x1, x9; \ - mul x6, x1, x10; \ - umulh x7, x1, x7; \ - umulh x8, x1, x8; \ - umulh x9, x1, x9; \ - umulh x10, x1, x10; \ - adds x4, x4, x7; \ - adcs x5, x5, x8; \ - adcs x6, x6, x9; \ - adc x10, x10, xzr; \ - ldp x7, x8, [p3]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x7, x8, [p3+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - adc x10, x10, xzr; \ - cmn x6, x6; \ - bic x6, x6, #0x8000000000000000; \ - adc x8, x10, x10; \ - mov x9, #19; \ - mul x7, x8, x9; \ - adds x3, x3, x7; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [p0]; \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ stp x5, x6, [p0+16] // Multiplex: z := if NZ then x else y #define mux_4(p0,p1,p2) \ - ldp x0, x1, [p1]; \ - ldp x2, x3, [p2]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0]; \ - ldp x0, x1, [p1+16]; \ - ldp x2, x3, [p2+16]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ stp x0, x1, [p0+16] S2N_BN_SYMBOL(curve25519_x25519_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_byte.S similarity index 100% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_byte.S diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_byte_alt.S similarity index 72% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_byte_alt.S index 511e2960bd3..fc71df70903 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519_byte_alt.S @@ -79,204 +79,204 @@ // trivially different from a pure function call to that subroutine. 
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 
__LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. #define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, 
x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Squaring just giving a result < 2 * p_25519, which is done by @@ -284,77 +284,77 @@ // optional correction. 
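The two multiplication macros converted above differ only in how far they reduce: mul_p25519 returns the product fully reduced modulo p_25519 = 2^255 - 19, while mul_4 stops once the result is known to be below 2 * p_25519, skipping the +1 and final correction in the quotient estimate. A minimal Python sketch of those output contracts (reference semantics only, not the instruction-level algorithm; the names are illustrative):

P25519 = 2**255 - 19

def mul_p25519_ref(a, b):
    # Strict variant: result is the canonical representative below P25519.
    return (a * b) % P25519

def mul_4_ref(a, b):
    # Relaxed variant: result is only guaranteed to be below 2 * P25519.
    t = a * b
    t = (t & (2**256 - 1)) + 38 * (t >> 256)   # 2^256 = 38 (mod P25519)
    t = (t & (2**255 - 1)) + 19 * (t >> 255)   # 2^255 = 19 (mod P25519)
    return t                                   # congruent to a*b mod p, at most one extra p

The sqr_4 macro converted in the next hunk keeps the same relaxed < 2 * p_25519 bound for squaring.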
#define sqr_4(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x11, x2, x5; \ - umulh x12, x2, x5; \ - mul x7, x2, x4; \ - umulh x6, x2, x4; \ - adds x10, x10, x7; \ - adcs x11, x11, x6; \ - mul x7, x3, x4; \ - umulh x6, x3, x4; \ - adc x6, x6, xzr; \ - adds x11, x11, x7; \ - mul x13, x4, x5; \ - umulh x14, x4, x5; \ - adcs x12, x12, x6; \ - mul x7, x3, x5; \ - umulh x6, x3, x5; \ - adc x6, x6, xzr; \ - adds x12, x12, x7; \ - adcs x13, x13, x6; \ - adc x14, x14, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - cset x6, cs; \ - umulh x7, x2, x2; \ - mul x8, x2, x2; \ - adds x9, x9, x7; \ - mul x7, x3, x3; \ - adcs x10, x10, x7; \ - umulh x7, x3, x3; \ - adcs x11, x11, x7; \ - mul x7, x4, x4; \ - adcs x12, x12, x7; \ - umulh x7, x4, x4; \ - adcs x13, x13, x7; \ - mul x7, x5, x5; \ - adcs x14, x14, x7; \ - umulh x7, x5, x5; \ - adc x6, x6, x7; \ - mov x3, #0x26; \ - mul x7, x3, x12; \ - umulh x4, x3, x12; \ - adds x8, x8, x7; \ - mul x7, x3, x13; \ - umulh x13, x3, x13; \ - adcs x9, x9, x7; \ - mul x7, x3, x14; \ - umulh x14, x3, x14; \ - adcs x10, x10, x7; \ - mul x7, x3, x6; \ - umulh x6, x3, x6; \ - adcs x11, x11, x7; \ - cset x12, cs; \ - adds x11, x11, x14; \ - adc x12, x12, x6; \ - cmn x11, x11; \ - bic x11, x11, #0x8000000000000000; \ - adc x2, x12, x12; \ - mov x3, #0x13; \ - mul x7, x3, x2; \ - adds x8, x8, x7; \ - adcs x9, x9, x4; \ - adcs x10, x10, x13; \ - adc x11, x11, xzr; \ - stp x8, x9, [P0]; \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ stp x10, x11, [P0+16] // Modular addition with double modulus 2 * p_25519 = 2^256 - 38. 
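The add_twice4 macro in the next hunk adds two 256-bit values modulo 2 * p_25519 = 2^256 - 38: if the four-limb addition carries out of 256 bits, adding 38 back is the same as subtracting 2 * p_25519. A hedged Python sketch of that contract (illustrative only; it relies on the assumption in the surrounding comment that at least one operand is already below 2 * p_25519):

P25519 = 2**255 - 19

def add_twice4_ref(a, b):
    s = a + b
    if s >= 2**256:              # the carry that "csel x9, x9, xzr, cs" checks
        s = s - 2**256 + 38      # i.e. subtract 2 * P25519
    return s                     # fits in 256 bits under the stated input assumption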
@@ -364,41 +364,41 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(p0,p1,p2) \ - ldp x5, x6, [p1]; \ - ldp x4, x3, [p2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [p1+16]; \ - ldp x4, x3, [p2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [p0]; \ + ldp x5, x6, [p1] __LF \ + ldp x4, x3, [p2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [p1+16] __LF \ + ldp x4, x3, [p2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [p0] __LF \ stp x7, x8, [p0+16] // Combined z = c * x + y with reduction only < 2 * p_25519 @@ -407,51 +407,51 @@ // high mul in the final part. 
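The cmadd_4 macro defined next computes z = c * x + y, where c is the single 64-bit word already held in x1, and reduces only below 2 * p_25519 by folding everything at or above bit 255 back in with the factor 19 (the "mov x9, #19" step). A rough Python model of that reduction (editor's sketch, not the register-level sequence):

P25519 = 2**255 - 19

def cmadd_4_ref(c, x, y):
    t = c * x + y                                # five-limb intermediate
    return (t & (2**255 - 1)) + 19 * (t >> 255)  # 2^255 = 19 (mod P25519); result < 2 * P25519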
#define cmadd_4(p0,p2,p3) \ - ldp x7, x8, [p2]; \ - ldp x9, x10, [p2+16]; \ - mul x3, x1, x7; \ - mul x4, x1, x8; \ - mul x5, x1, x9; \ - mul x6, x1, x10; \ - umulh x7, x1, x7; \ - umulh x8, x1, x8; \ - umulh x9, x1, x9; \ - umulh x10, x1, x10; \ - adds x4, x4, x7; \ - adcs x5, x5, x8; \ - adcs x6, x6, x9; \ - adc x10, x10, xzr; \ - ldp x7, x8, [p3]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x7, x8, [p3+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - adc x10, x10, xzr; \ - cmn x6, x6; \ - bic x6, x6, #0x8000000000000000; \ - adc x8, x10, x10; \ - mov x9, #19; \ - mul x7, x8, x9; \ - adds x3, x3, x7; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [p0]; \ + ldp x7, x8, [p2] __LF \ + ldp x9, x10, [p2+16] __LF \ + mul x3, x1, x7 __LF \ + mul x4, x1, x8 __LF \ + mul x5, x1, x9 __LF \ + mul x6, x1, x10 __LF \ + umulh x7, x1, x7 __LF \ + umulh x8, x1, x8 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + adds x4, x4, x7 __LF \ + adcs x5, x5, x8 __LF \ + adcs x6, x6, x9 __LF \ + adc x10, x10, xzr __LF \ + ldp x7, x8, [p3] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x7, x8, [p3+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + adc x10, x10, xzr __LF \ + cmn x6, x6 __LF \ + bic x6, x6, #0x8000000000000000 __LF \ + adc x8, x10, x10 __LF \ + mov x9, #19 __LF \ + mul x7, x8, x9 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [p0] __LF \ stp x5, x6, [p0+16] // Multiplex: z := if NZ then x else y #define mux_4(p0,p1,p2) \ - ldp x0, x1, [p1]; \ - ldp x2, x3, [p2]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0]; \ - ldp x0, x1, [p1+16]; \ - ldp x2, x3, [p2+16]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ + ldp x0, x1, [p1] __LF \ + ldp x2, x3, [p2] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ + stp x0, x1, [p0] __LF \ + ldp x0, x1, [p1+16] __LF \ + ldp x2, x3, [p2+16] __LF \ + csel x0, x0, x2, ne __LF \ + csel x1, x1, x3, ne __LF \ stp x0, x1, [p0+16] S2N_BN_SYMBOL(curve25519_x25519_byte_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base.S similarity index 92% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519base.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base.S index ef46f7b169e..748032a8ece 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base.S @@ -78,382 +78,382 @@ // trivially different from a pure function call to that subroutine. 
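The mux_4 macro just converted is a constant-time multiplex (z := if NZ then x else y): every limb of both inputs is loaded and a csel picks one per limb, so memory access does not depend on the condition. A mask-based Python sketch of the same data flow (illustrative; the real macro keys off the condition flags set by an earlier instruction):

def mux_4_ref(take_x, x, y):
    mask = -int(bool(take_x)) & (2**256 - 1)
    return (x & mask) | (y & ~mask & (2**256 - 1))

The following hunks rename curve25519_x25519base.S under s2n-bignum-imported/ and apply the same semicolon-to-__LF rewrite to its copies of these field macros, starting with mul_p25519 below.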
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, 
#31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 
__LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. 
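Unlike the _alt variant earlier in the diff, the mul_p25519 above (and the mul_4 that follows) builds each 64x64-bit partial product from four 32x32-bit umull multiplies rather than mul/umulh pairs. A small Python sketch of that building block (illustrative names; the macro interleaves the additions instead of forming t explicitly):

def mul64x64_via_32bit_halves(a, b):
    a_lo, a_hi = a & 0xffffffff, a >> 32
    b_lo, b_hi = b & 0xffffffff, b >> 32
    t = (a_lo * b_lo
         + ((a_hi * b_lo) << 32)
         + ((a_lo * b_hi) << 32)
         + ((a_hi * b_hi) << 64))
    return t & (2**64 - 1), t >> 64   # (low limb, high limb) of a * b

assert mul64x64_via_32bit_halves(2**64 - 1, 2**64 - 1) == (1, 2**64 - 2)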
#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; 
\ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + 
eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. @@ -463,37 +463,37 @@ // at least one of them is reduced double modulo. 
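sub_twice4, converted just above, is the subtraction counterpart: a borrow out of the 256-bit subtraction is repaired by subtracting a further 38, which, working modulo 2^256, is the same as adding back 2 * p_25519. The next hunk converts add_twice4 and double_twice4, which use the mirrored conditional +38 on a carry out. A hedged Python sketch of both contracts (illustrative; both rely on the reduced-input assumptions stated in the surrounding comments):

P25519 = 2**255 - 19

def sub_twice4_ref(a, b):
    d = a - b
    if d < 0:                    # the borrow that "csel x3, x4, xzr, lo" checks
        d += 2 * P25519          # equivalently: wrap mod 2^256, then subtract 38
    return d

def double_twice4_ref(a):
    d = 2 * a
    if d >= 2**256:              # carry out of the doubling
        d = d - 2**256 + 38      # i.e. subtract 2 * P25519
    return d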
#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(curve25519_x25519base): diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_alt.S similarity index 95% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_alt.S index 702fe6e88aa..e834548f91c 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_alt.S @@ -78,224 +78,224 @@ // trivially different from a pure function call to that subroutine. 
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 
__LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. #define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, 
x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 
* p_25519 = 2^256 - 38. @@ -305,37 +305,37 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(curve25519_x25519base_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte.S similarity index 93% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte.S index 635729cb77a..82eb0986151 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte.S @@ -78,382 +78,382 @@ // trivially different from a pure function call to that subroutine. 
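All of these macros fold high limbs back in with the constants 38 (#0x26) and 19 (#0x13). Those come from 2^256 mod p_25519 = 38 and 2^255 mod p_25519 = 19, with p_25519 = 2^255 - 19, so 2 * p_25519 = 2^256 - 38 as the comments state. A quick Python check of the identities (editor's note, not part of the diff):

P25519 = 2**255 - 19
assert 2**255 % P25519 == 19 and 2**256 % P25519 == 38
assert 2 * P25519 == 2**256 - 38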
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, 
#31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 
__LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. 
#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; 
\ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + 
eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. @@ -463,37 +463,37 @@ // at least one of them is reduced double modulo. 
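
For readers following the macro changes above: sub_twice4 and add_twice4 work with residues modulo the double modulus 2 * p_25519 = 2^256 - 38, so a borrow out of the 256-bit subtraction is repaired by taking away a further 38, and a carry out of the addition by adding 38 back. A minimal Python reference model of that behaviour (the function names are mine, not part of s2n-bignum; plain integers stand in for the 4x64-bit limb vectors):

    import random

    P2 = 2**256 - 38            # double modulus 2 * p_25519
    M = 2**256                  # range of four 64-bit limbs

    def sub_twice4(x, y):
        # 256-bit subtract; on borrow the wrapped value is x - y + 2^256,
        # so removing 38 more leaves x - y + 2*p_25519.
        d = (x - y) % M
        return d - 38 if x < y else d

    def add_twice4(x, y):
        # 256-bit add; a dropped carry of 2^256 equals 2*p_25519 + 38,
        # so adding 38 back keeps the result congruent modulo 2*p_25519.
        s = x + y
        return s - M + 38 if s >= M else s

    for _ in range(1000):
        x, y = random.randrange(P2), random.randrange(P2)
        assert sub_twice4(x, y) % P2 == (x - y) % P2
        assert add_twice4(x, y) % P2 == (x + y) % P2

As the comment above notes, the addition result is only guaranteed to fit in four limbs, not to be below 2 * p_25519 again; the congruence is what matters.
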
#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(curve25519_x25519base_byte): diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte_alt.S similarity index 95% rename from third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte_alt.S index 39b6bfd1724..b3062b6837d 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519base_byte_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/curve25519_x25519base_byte_alt.S @@ -78,224 +78,224 @@ // trivially different from a pure function call to that subroutine. 
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 
__LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. #define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, 
x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 
* p_25519 = 2^256 - 38. @@ -305,37 +305,37 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(curve25519_x25519base_byte_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_decode.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode.S similarity index 96% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_decode.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode.S index f565df90fd1..715662b1c9e 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_decode.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode.S @@ -59,23 +59,23 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macros wrapping up calls to the local subroutines #define mulp(dest,src1,src2) \ - add x0, dest; \ - add x1, src1; \ - add x2, src2; \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ bl edwards25519_decode_mul_p25519 #define nsqr(dest,n,src) \ - add x0, dest; \ - mov x1, n; \ - add x2, src; \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ bl edwards25519_decode_nsqr_p25519 S2N_BN_SYMBOL(edwards25519_decode): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_decode_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode_alt.S similarity index 95% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_decode_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode_alt.S index befacd2ff01..79743f73b03 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_decode_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_decode_alt.S @@ -59,23 +59,23 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 
__LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macros wrapping up calls to the local subroutines #define mulp(dest,src1,src2) \ - add x0, dest; \ - add x1, src1; \ - add x2, src2; \ + add x0, dest __LF \ + add x1, src1 __LF \ + add x2, src2 __LF \ bl edwards25519_decode_alt_mul_p25519 #define nsqr(dest,n,src) \ - add x0, dest; \ - mov x1, n; \ - add x2, src; \ + add x0, dest __LF \ + mov x1, n __LF \ + add x2, src __LF \ bl edwards25519_decode_alt_nsqr_p25519 S2N_BN_SYMBOL(edwards25519_decode_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_encode.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_encode.S similarity index 100% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_encode.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_encode.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd.S new file mode 100644 index 00000000000..ee94ffc3370 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd.S @@ -0,0 +1,588 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective addition for edwards25519 +// Inputs p1[16], p2[16]; output p3[16] +// +// extern void edwards25519_epadd +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]) +// +// The output p3 and both inputs p1 and p2 are points (x,y) on +// edwards25519 represented in extended projective quadruples (X,Y,Z,T) +// where x = X / Z, y = Y / Z and x * y = T / Z. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 +#define p2 x20 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) +#define w_1 p1, #(3*NUMSIZE) + +#define x_2 p2, #0 +#define y_2 p2, #NUMSIZE +#define z_2 p2, #(2*NUMSIZE) +#define w_2 p2, #(3*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) +#define t5 sp, #(5*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519, only +// trivially different from a pure function call to that subroutine. 
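
The mul_p25519 macro that follows folds the 512-bit product back down using the fact that 2^256 = 2 * p_25519 + 38, i.e. 2^256 is congruent to 38 modulo p_25519, before a final conditional subtraction lands in the canonical range. A quick Python check of that identity and of the fold step it licenses (the helper name is mine, for illustration only):

    import random

    P = 2**255 - 19
    assert 2**256 % P == 38           # 2^256 = 2*p_25519 + 38

    def fold38(t):
        # Replace the bits at and above 2^256 by 38 times their value.
        return (t & (2**256 - 1)) + 38 * (t >> 256)

    for _ in range(1000):
        a, b = random.randrange(P), random.randrange(P)
        t = fold38(fold38(a * b))     # roughly 256 bits, still congruent to a*b
        assert t % P == (a * b) % P
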
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, 
x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor 
x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +#define double_4(P0,P1) \ + ldp x0, x1, [P1] __LF \ + adds x0, x0, x0 __LF \ + adcs x1, x1, x1 __LF \ + ldp x2, x3, [P1+16] __LF \ + adcs x2, x2, x2 __LF \ + adc x3, x3, x3 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movz x0, #0xf159 __LF \ + movz x1, #0xb156 __LF \ + movz x2, #0xd130 __LF \ + movz x3, #0xfce7 __LF \ + movk x0, #0x26b2, lsl #16 __LF \ + movk x1, #0x8283, lsl #16 __LF \ + movk x2, #0xeef3, lsl #16 __LF \ + movk x3, #0x56df, lsl #16 __LF \ + movk x0, #0x9b94, lsl #32 __LF \ + movk x1, #0x149a, lsl #32 __LF \ + movk x2, #0x80f2, lsl #32 __LF \ + movk x3, #0xd9dc, lsl #32 __LF \ + movk x0, #0xebd6, lsl #48 __LF \ + movk x1, #0x00e0, lsl #48 __LF \ + movk x2, #0x198e, lsl #48 __LF \ + movk x3, #0x2406, lsl #48 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +S2N_BN_SYMBOL(edwards25519_epadd): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + mov p2, x2 + +// Main sequence + + mul_4(t0,w_1,w_2) + + sub_4(t1,y_1,x_1) + sub_4(t2,y_2,x_2) + add_4(t3,y_1,x_1) + add_4(t4,y_2,x_2) + double_4(t5,z_2) + + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + + load_k25519(t2) + mul_4(t2,t2,t0) + + mul_4(t4,z_1,t5) + + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + + mul_p25519(w_3,t0,t5) + mul_p25519(x_3,t0,t1) + mul_p25519(y_3,t3,t5) + mul_p25519(z_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd_alt.S new file mode 100644 index 00000000000..4324c25245f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epadd_alt.S @@ -0,0 +1,431 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective addition for edwards25519 +// Inputs p1[16], p2[16]; output p3[16] +// +// extern void edwards25519_epadd_alt +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]) +// +// The output p3 and both inputs p1 and p2 are points (x,y) on +// edwards25519 represented in extended projective quadruples (X,Y,Z,T) +// where x = X / Z, y = Y / Z and x * y = T / Z. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 +#define p2 x20 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) +#define w_1 p1, #(3*NUMSIZE) + +#define x_2 p2, #0 +#define y_2 p2, #NUMSIZE +#define z_2 p2, #(2*NUMSIZE) +#define w_2 p2, #(3*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) +#define t5 sp, #(5*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519_alt, only +// trivially different from a pure function call to that subroutine. 
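
The main sequence of edwards25519_epadd above (this _alt file repeats it with the alternative multiply macros) is the standard unified extended-coordinate addition with k = 2*d. A compact Python model of what that sequence computes, convenient for cross-checking test vectors; the function and variable names are my own, not part of the library:

    P = 2**255 - 19
    D = (-121665 * pow(121666, -1, P)) % P       # Edwards curve constant d
    K = 2 * D % P                                # k_25519, as loaded by load_k25519
    assert K == 0x2406d9dc56dffce7198e80f2eef3d13000e0149a8283b156ebd69b9426b2f159

    def epadd_model(p1, p2):
        # p1, p2, result: (X, Y, Z, T) with x = X/Z, y = Y/Z, x*y = T/Z.
        X1, Y1, Z1, T1 = p1
        X2, Y2, Z2, T2 = p2
        a = (Y1 - X1) * (Y2 - X2) % P            # t1
        b = (Y1 + X1) * (Y2 + X2) % P            # t3
        c = K * T1 * T2 % P                      # t2
        d = 2 * Z1 * Z2 % P                      # t4
        e, h = (b - a) % P, (b + a) % P          # t0, t5
        f, g = (d - c) % P, (d + c) % P          # t1, t3
        return (e * f % P, g * h % P, f * g % P, e * h % P)   # X3, Y3, Z3, T3

Converting affine inputs (x, y) to (x, y, 1, x*y mod P) and comparing X3/Z3 and Y3/Z3 against the textbook Edwards addition law is a quick way to exercise the new assembly against this model.
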
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
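
To make the distinction in the comment above concrete: mul_p25519 finishes its quotient estimate with the "+1" and an optional correction so the output lands in [0, p_25519), while mul_4 below stops once the value is known to be an acceptable representative below 2 * p_25519. A rough Python sketch of the two behaviours (my own helper names; the assembly's limb-level carry handling is not reproduced):

    import random

    P = 2**255 - 19

    def mul_4_sketch(a, b):
        # Congruent to a*b mod p, only guaranteed to stay below 2*p.
        t = a * b
        t = (t & (2**256 - 1)) + 38 * (t >> 256)    # 2^256 == 38 (mod p)
        t = (t & (2**256 - 1)) + 38 * (t >> 256)
        return (t & (2**255 - 1)) + 19 * (t >> 255) # 2^255 == 19 (mod p)

    def mul_p25519_sketch(a, b):
        # Same folding, plus the final conditional subtraction into [0, p).
        t = mul_4_sketch(a, b)
        return t - P if t >= P else t

    for _ in range(1000):
        a, b = random.randrange(P), random.randrange(P)
        assert mul_4_sketch(a, b) % P == a * b % P
        assert mul_p25519_sketch(a, b) == a * b % P
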
+ +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +#define double_4(P0,P1) \ + ldp x0, x1, [P1] __LF \ + adds x0, x0, x0 __LF \ + adcs x1, x1, x1 __LF \ + ldp x2, x3, [P1+16] __LF \ + adcs x2, x2, x2 __LF \ + adc x3, x3, x3 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movz x0, #0xf159 __LF \ + movz x1, #0xb156 __LF \ + movz x2, #0xd130 __LF \ + movz x3, #0xfce7 __LF \ + movk x0, #0x26b2, lsl #16 __LF \ + movk x1, #0x8283, lsl #16 __LF \ + movk x2, #0xeef3, lsl #16 __LF \ + movk x3, #0x56df, lsl #16 __LF \ + movk x0, #0x9b94, lsl #32 __LF \ + movk x1, #0x149a, lsl #32 __LF \ + movk x2, #0x80f2, lsl #32 __LF \ + movk x3, #0xd9dc, lsl #32 __LF \ + movk x0, #0xebd6, lsl #48 __LF \ + movk x1, #0x00e0, lsl #48 __LF \ + movk x2, #0x198e, lsl #48 __LF \ + movk x3, #0x2406, lsl #48 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +S2N_BN_SYMBOL(edwards25519_epadd_alt): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + mov p2, x2 + +// Main sequence + + mul_4(t0,w_1,w_2) + + sub_4(t1,y_1,x_1) + sub_4(t2,y_2,x_2) + add_4(t3,y_1,x_1) + add_4(t4,y_2,x_2) + double_4(t5,z_2) + + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + + load_k25519(t2) + mul_4(t2,t2,t0) + + mul_4(t4,z_1,t5) + + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + + mul_p25519(w_3,t0,t5) + mul_p25519(x_3,t0,t1) + mul_p25519(y_3,t3,t5) + mul_p25519(z_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble.S new file mode 100644 index 00000000000..13ac61219b9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble.S @@ -0,0 +1,494 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective doubling for edwards25519 +// Input p1[12]; output p3[16] +// +// extern void edwards25519_epdouble +// (uint64_t p3[static 16],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// The output p3 is in extended projective coordinates, representing +// affine (x,y) by a quadruple (X,Y,Z,T) where x = X / Z, y = Y / Z +// and x * y = T / Z. The input p1 may also be in the same extended +// projective representation, but the final T field is not used so +// a more basic projective triple (X,Y,Z) suffices. +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epdouble) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519, only +// trivially different from a pure function call to that subroutine. 
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, 
x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + 
adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umull x11, w11, w13 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// Plain 4-digit adding without any normalization. +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
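To make the carry-folding step concrete, here is a minimal C sketch of the same reduction (illustration only, not part of the imported source; the helper name add_twice4_sketch and the use of unsigned __int128 are ours). Since 2^256 == 38 modulo the double modulus 2^256 - 38, a carry out of the top limb is folded back in as +38, and the precondition stated above is exactly what guarantees that this fold cannot itself carry out of the top limb a second time.

    #include <stdint.h>

    /* Hedged sketch of the add_twice4/double_twice4 reduction step:
       add two 4-limb values and reduce modulo 2*p_25519 = 2^256 - 38
       by folding a carry out of bit 256 back in as +38. */
    static void add_twice4_sketch(uint64_t z[4], const uint64_t x[4],
                                  const uint64_t y[4]) {
      uint64_t t[4];
      unsigned __int128 acc = 0;
      for (int i = 0; i < 4; i++) {        /* plain 256-bit addition */
        acc += (unsigned __int128)x[i] + y[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
      }
      uint64_t fold = acc ? 38 : 0;        /* 2^256 == 38 (mod 2^256 - 38) */
      acc = fold;
      for (int i = 0; i < 4; i++) {        /* final carry is zero whenever  */
        acc += t[i];                       /* the input sum is < 2^257 - 38 */
        z[i] = (uint64_t)acc;
        acc >>= 64;
      }
    }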
+ +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +#define double_twice4(P0,P1) \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_epdouble): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(w_3,t1,t4) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble_alt.S new file mode 100644 index 00000000000..c6b9332c09c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_epdouble_alt.S @@ -0,0 +1,357 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective doubling for edwards25519 +// Input p1[12]; output p3[16] +// +// extern void edwards25519_epdouble +// (uint64_t p3[static 16],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// The output p3 is in extended projective coordinates, representing +// affine (x,y) by a quadruple (X,Y,Z,T) where x = X / Z, y = Y / Z +// and x * y = T / Z. The input p1 may also be in the same extended +// projective representation, but the final T field is not used so +// a more basic projective triple (X,Y,Z) suffices. 
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epdouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519_alt, only +// trivially different from a pure function call to that subroutine. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs 
x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Plain 4-digit adding without any normalization. +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. 
The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +#define double_twice4(P0,P1) \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_epdouble_alt): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(w_3,t1,t4) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble.S new file mode 100644 index 00000000000..c79ab204693 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble.S @@ -0,0 +1,489 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective doubling for edwards25519 +// Input p1[12]; output p3[12] +// +// extern void edwards25519_pdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// Input and output are in pure projective coordinates, representing +// an affine (x,y) by a triple (X,Y,Z) where x = X / Z, y = Y / Z. 
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pdouble) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519, only +// trivially different from a pure function call to that subroutine. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, 
x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
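In case the shortcut is not obvious from the instruction stream: since p_25519 = 2^255 - 19, we have 2^255 == 19 and 2^256 == 38 (mod p_25519), so a 512-bit square splits as

    z = 2^256 * h + l == 38 * h + l (mod p_25519).

The macro below folds the high half back in with the factor 0x26 = 38 and the remaining top bits with 0x13 = 19; because the quotient estimate is not rounded up and the final conditional subtraction is omitted, the result is only guaranteed to be < 2 * p_25519 rather than fully reduced, which appears to be all that the intermediate computations in this file require.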
+ +#define sqr_4(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umull x11, w11, w13 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + stp x2, x3, [P0] __LF \ + stp 
x4, x5, [P0+16] + +// Plain 4-digit adding without any normalization. +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +#define double_twice4(P0,P1) \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_pdouble): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble_alt.S new file mode 100644 index 00000000000..8b9e75eb925 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pdouble_alt.S @@ -0,0 +1,352 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective doubling for edwards25519 +// Input p1[12]; output p3[12] +// +// extern void edwards25519_pdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// Input and output are in pure projective coordinates, representing +// an affine (x,y) by a triple (X,Y,Z) where x = X / Z, y = Y / Z. +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pdouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519_alt, only +// trivially different from a pure function call to that subroutine. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh 
x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Plain 4-digit adding without any normalization. +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. 
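These add/sub/double helpers feed the main sequence at the end of this file. For orientation, our reading of that sequence (not a comment from the imported source) is that, writing A = X1^2, B = Y1^2 and C = 2*Z1^2, the temporaries become

    t4 = A + B,   t2 = A - B,   t3 = C + A - B,
    t1 = (A + B) - (X1 + Y1)^2 == -2*X1*Y1 (mod p_25519),

so that, modulo p_25519,

    X3 = -2*X1*Y1 * (C + A - B),   Y3 = (A - B) * (A + B),   Z3 = (C + A - B) * (A - B),

which appears to match the standard projective doubling formulas for a twisted Edwards curve with a = -1, i.e. x3 = 2*x1*y1 / (y1^2 - x1^2) and y3 = (x1^2 + y1^2) / (2 + x1^2 - y1^2).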
+ +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +#define double_twice4(P0,P1) \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_pdouble_alt): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd.S new file mode 100644 index 00000000000..4fefdfc3693 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd.S @@ -0,0 +1,562 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective + precomputed mixed addition for edwards25519 +// Inputs p1[16], p2[12]; output p3[16] +// +// extern void edwards25519_pepadd +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]) +// +// The output p3 and the first input p1 are points (x,y) on edwards25519 +// represented in extended projective quadruples (X,Y,Z,T) where +// x = X / Z, y = Y / Z and x * y = T / Z. The second input p2 is a triple +// encoding its point (x,y) as (y - x,y + x,2 * d * x * y) where d is the +// usual Edwards curve parameter for edwards25519. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pepadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pepadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 +#define p2 x20 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) +#define w_1 p1, #(3*NUMSIZE) + +#define ymx_2 p2, #0 +#define xpy_2 p2, #NUMSIZE +#define kxy_2 p2, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) +#define t5 sp, #(5*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519, only +// trivially different from a pure function call to that subroutine. 
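As a plain-C reference for the contract we read here, namely a fully reduced product modulo p_25519 = 2^255 - 19, the following sketch multiplies schoolbook-style and then reduces by folding with 38 = 2^256 mod p_25519 and 19 = 2^255 mod p_25519, finishing with at most one subtraction of p_25519. It is an illustration only (not the s2n-bignum algorithm, not constant-time; the helper name mul_p25519_sketch is ours).

    #include <stdint.h>

    static void mul_p25519_sketch(uint64_t z[4], const uint64_t x[4],
                                  const uint64_t y[4]) {
      /* 512-bit schoolbook product w = x * y */
      uint64_t w[8] = {0};
      for (int i = 0; i < 4; i++) {
        unsigned __int128 c = 0;
        for (int j = 0; j < 4; j++) {
          c += (unsigned __int128)x[i] * y[j] + w[i + j];
          w[i + j] = (uint64_t)c;
          c >>= 64;
        }
        w[i + 4] = (uint64_t)c;
      }
      /* fold the high half: 2^256 == 38 (mod p_25519) */
      uint64_t t[4];
      unsigned __int128 c = 0;
      for (int i = 0; i < 4; i++) {
        c += (unsigned __int128)38 * w[i + 4] + w[i];
        t[i] = (uint64_t)c;
        c >>= 64;
      }
      /* fold bit 255 and above: 2^255 == 19 (mod p_25519) */
      uint64_t q = ((uint64_t)c << 1) | (t[3] >> 63);
      t[3] &= 0x7fffffffffffffffULL;
      c = (unsigned __int128)19 * q;
      for (int i = 0; i < 4; i++) {
        c += t[i];
        t[i] = (uint64_t)c;
        c >>= 64;
      }
      /* at most one subtraction of p_25519 is now needed */
      const uint64_t p[4] = {0xffffffffffffffedULL, 0xffffffffffffffffULL,
                             0xffffffffffffffffULL, 0x7fffffffffffffffULL};
      uint64_t d[4];
      unsigned __int128 b = 0;
      for (int i = 0; i < 4; i++) {
        unsigned __int128 s = (unsigned __int128)t[i] - p[i] - (uint64_t)b;
        d[i] = (uint64_t)s;
        b = (s >> 127) & 1;              /* 1 if the subtraction borrowed */
      }
      for (int i = 0; i < 4; i++) z[i] = b ? t[i] : d[i];
    }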
+ +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, 
x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor 
x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +#define double_4(P0,P1) \ + ldp x0, x1, [P1] __LF \ + adds x0, x0, x0 __LF \ + adcs x1, x1, x1 __LF \ + ldp x2, x3, [P1+16] __LF \ + adcs x2, x2, x2 __LF \ + adc x3, x3, x3 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_pepadd): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! + sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + mov p2, x2 + +// Main sequence + + double_4(t0,z_1); + + sub_4(t1,y_1,x_1); + add_4(t2,y_1,x_1); + + mul_4(t3,w_1,kxy_2); + + mul_4(t1,t1,ymx_2); + mul_4(t2,t2,xpy_2); + + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + + mul_p25519(z_3,t4,t0); + mul_p25519(x_3,t5,t4); + mul_p25519(y_3,t0,t1); + mul_p25519(w_3,t5,t1); + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd_alt.S new file mode 100644 index 00000000000..eb9d55f1adb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_pepadd_alt.S @@ -0,0 +1,404 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective + precomputed mixed addition for edwards25519 +// Inputs p1[16], p2[12]; output p3[16] +// +// extern void edwards25519_pepadd_alt +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]) +// +// The output p3 and the first input p1 are points (x,y) on edwards25519 +// represented in extended projective quadruples (X,Y,Z,T) where +// x = X / Z, y = Y / Z and x * y = T / Z. 
The second input p2 is a triple +// encoding its point (x,y) as (y - x,y + x,2 * d * x * y) where d is the +// usual Edwards curve parameter for edwards25519. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pepadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pepadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define p3 x17 +#define p1 x19 +#define p2 x20 + +// Pointers to input and output coordinates + +#define x_1 p1, #0 +#define y_1 p1, #NUMSIZE +#define z_1 p1, #(2*NUMSIZE) +#define w_1 p1, #(3*NUMSIZE) + +#define ymx_2 p2, #0 +#define xpy_2 p2, #NUMSIZE +#define kxy_2 p2, #(2*NUMSIZE) + +#define x_3 p3, #0 +#define y_3 p3, #NUMSIZE +#define z_3 p3, #(2*NUMSIZE) +#define w_3 p3, #(3*NUMSIZE) + +// Pointer-offset pairs for temporaries on stack + +#define t0 sp, #(0*NUMSIZE) +#define t1 sp, #(1*NUMSIZE) +#define t2 sp, #(2*NUMSIZE) +#define t3 sp, #(3*NUMSIZE) +#define t4 sp, #(4*NUMSIZE) +#define t5 sp, #(5*NUMSIZE) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field operation bignum_mul_p25519_alt, only +// trivially different from a pure function call to that subroutine. + +#define mul_p25519(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, 
x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. 
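As context for the add_4 and double_4 macros that follow: each is a plain 256-bit carry chain over four 64-bit limbs, where adds starts the chain, adcs propagates it, and the final adc silently drops any carry out of the top limb (which cannot occur for the stated input range). A minimal C sketch of the same operation, assuming little-endian limb order and a compiler with the unsigned __int128 extension; the function name is illustrative and not part of the AWS-LC API:

    #include <stdint.h>

    /* Illustrative 4-limb (256-bit) addition mirroring the adds/adcs/adc
     * chain in add_4; any carry out of the top limb is discarded, exactly
     * as in the macro. double_4 is the same operation with x == y. */
    static void add_4_sketch(uint64_t z[4], const uint64_t x[4],
                             const uint64_t y[4]) {
      unsigned __int128 acc = 0;              /* running limb sum plus carry */
      for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)x[i] + y[i];
        z[i] = (uint64_t)acc;                 /* low 64 bits of this limb */
        acc >>= 64;                           /* carry into the next limb */
      }
    }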
+ +#define add_4(P0,P1,P2) \ + ldp x0, x1, [P1] __LF \ + ldp x4, x5, [P2] __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + ldp x2, x3, [P1+16] __LF \ + ldp x6, x7, [P2+16] __LF \ + adcs x2, x2, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +#define double_4(P0,P1) \ + ldp x0, x1, [P1] __LF \ + adds x0, x0, x0 __LF \ + adcs x1, x1, x1 __LF \ + ldp x2, x3, [P1+16] __LF \ + adcs x2, x2, x2 __LF \ + adc x3, x3, x3 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x3, #19 __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + mov x4, #0x8000000000000000 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(edwards25519_pepadd_alt): + +// Save regs and make room for temporaries + + stp x19, x20, [sp, -16]! 
+ sub sp, sp, #NSPACE + +// Move the input arguments to stable places + + mov p3, x0 + mov p1, x1 + mov p2, x2 + +// Main sequence + + double_4(t0,z_1); + + sub_4(t1,y_1,x_1); + add_4(t2,y_1,x_1); + + mul_4(t3,w_1,kxy_2); + + mul_4(t1,t1,ymx_2); + mul_4(t2,t2,xpy_2); + + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + + mul_p25519(z_3,t4,t0); + mul_p25519(x_3,t5,t4); + mul_p25519(y_3,t0,t1); + mul_p25519(w_3,t5,t1); + +// Restore stack and registers + + add sp, sp, #NSPACE + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase.S similarity index 92% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase.S index e00aa7e278a..51be0c8427f 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase.S @@ -77,391 +77,391 @@ // Load 64-bit immediate into a register #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macro wrapping up the basic field operation bignum_mul_p25519, only // trivially different from a pure function call to that subroutine. #define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - 
subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + 
cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. 
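The reduction strategy here and in mul_p25519 rests on the identity 2^256 - 38 = 2 * p_25519, so 2^256 is congruent to 38 both modulo p_25519 and modulo 2 * p_25519; that is why the constants 0x26 (38) and 0x13 (19) appear throughout these macros. A hedged C sketch of the first folding step over an 8-limb double-width product follows; the helper name is illustrative, and the final fold of the small leftover carry, which the assembly interleaves into its carry chain, is omitted:

    #include <stdint.h>

    /* Fold the high 256 bits of an 8-limb product back into the low 256
     * bits using 2^256 == 38 (mod p_25519). Returns the small leftover
     * carry (< 39), which still needs one more folding step. */
    static uint64_t fold_high_times_38(uint64_t z[4], const uint64_t prod[8]) {
      unsigned __int128 acc = 0;
      for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)prod[i] +
               (unsigned __int128)prod[i + 4] * 38u;
        z[i] = (uint64_t)acc;
        acc >>= 64;
      }
      return (uint64_t)acc;
    }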
#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; 
\ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + 
eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. @@ -471,37 +471,37 @@ // at least one of them is reduced double modulo. 
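Both sub_twice4 above and add_twice4 below work modulo the double modulus 2 * p_25519 = 2^256 - 38: an addition that carries out of 2^256 is compensated by adding 38 back in, and a subtraction that borrows is compensated by subtracting 38, since 2^256 and 38 are congruent modulo 2 * p_25519. A minimal C sketch of the addition case, using an illustrative (non-AWS-LC) name; like the assembly, it only promises a 4-limb result, not one below 2 * p_25519:

    #include <stdint.h>

    /* Addition modulo 2*p_25519 = 2^256 - 38: add the limbs, then fold any
     * carry out of the top limb back in as +38 (the mov #38 / csel ... cs
     * pair in the macro). Written with loops and an early exit for clarity;
     * the assembly is straight-line and branch-free. */
    static void add_twice4_sketch(uint64_t z[4], const uint64_t x[4],
                                  const uint64_t y[4]) {
      unsigned __int128 acc = 0;
      for (int i = 0; i < 4; i++) {
        acc += (unsigned __int128)x[i] + y[i];
        z[i] = (uint64_t)acc;
        acc >>= 64;
      }
      unsigned __int128 fold = acc ? 38 : 0;
      for (int i = 0; i < 4 && fold; i++) {
        fold += z[i];
        z[i] = (uint64_t)fold;
        fold >>= 64;
      }
    }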
#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(edwards25519_scalarmulbase): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase_alt.S similarity index 95% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase_alt.S index 2ffc7799edb..726ae766dfc 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmulbase_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmulbase_alt.S @@ -77,233 +77,233 @@ // Load 64-bit immediate into a register #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macro wrapping up the basic field operation bignum_mul_p25519_alt, only // trivially different from a pure function call to that subroutine. 
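Unlike mul_4, the mul_p25519 macros return a fully reduced result strictly below p_25519, so they end with one extra conditional correction. The sketch below shows the effect of that final step in C, assuming the value has already been brought below 2 * p_25519; the function name is hypothetical, and the assembly reaches the same result by folding a quotient estimate (the madd with 0x13) into its carry chain rather than running a separate subtraction pass:

    #include <stdint.h>

    /* Bring a value known to be < 2*p_25519 into canonical range by
     * conditionally subtracting p_25519 = 2^255 - 19. Written with a
     * branch for clarity; the assembly is branch-free (csel plus masking). */
    static void reduce_once_sketch(uint64_t z[4]) {
      static const uint64_t p[4] = {0xffffffffffffffedULL, 0xffffffffffffffffULL,
                                    0xffffffffffffffffULL, 0x7fffffffffffffffULL};
      uint64_t t[4], borrow = 0;
      for (int i = 0; i < 4; i++) {
        uint64_t d = z[i] - p[i];
        uint64_t b1 = z[i] < p[i];
        t[i] = d - borrow;
        uint64_t b2 = d < borrow;
        borrow = b1 | b2;
      }
      if (!borrow) {            /* no borrow means z >= p, so keep z - p */
        for (int i = 0; i < 4; i++) z[i] = t[i];
      }
    }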
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 
__LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. #define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, 
x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 
* p_25519 = 2^256 - 38. @@ -313,37 +313,37 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble.S similarity index 73% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble.S index d8c6e21c6e3..3d51f22d3d2 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble.S @@ -99,371 +99,371 @@ // Load 64-bit immediate into a register #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macro wrapping up the basic field operation bignum_mul_p25519, only // trivially different from a pure function call to that subroutine. 
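The non-alt macros in this file differ from the _alt variants earlier in this diff partly in how they form 64x64 -> 128-bit partial products: several are built from 32x32 -> 64-bit umull multiplies and shifts rather than a mul/umulh pair, a trade-off that suits different microarchitectures (presumably why both variants are kept). A C sketch of that decomposition, with an illustrative helper name:

    #include <stdint.h>

    /* Illustrative 64x64 -> 128-bit multiply built from 32x32 -> 64-bit
     * products, the same shape as the umull-based sequences below, which
     * avoid umulh. */
    static void mul_64x64_128_sketch(uint64_t a, uint64_t b,
                                     uint64_t *hi, uint64_t *lo) {
      uint64_t a0 = (uint32_t)a, a1 = a >> 32;
      uint64_t b0 = (uint32_t)b, b1 = b >> 32;
      uint64_t p00 = a0 * b0, p01 = a0 * b1;
      uint64_t p10 = a1 * b0, p11 = a1 * b1;
      uint64_t t = p10 + (p00 >> 32);         /* cannot overflow 64 bits */
      uint64_t u = p01 + (t & 0xffffffffu);   /* cannot overflow 64 bits */
      *lo = (u << 32) | (p00 & 0xffffffffu);
      *hi = p11 + (t >> 32) + (u >> 32);
    }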
#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, 
#31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 
__LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umaddl x5, w5, w0, x5 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + mov x3, #0x13 __LF \ + tst x10, #0x8000000000000000 __LF \ + csel x3, x3, xzr, pl __LF \ + subs x7, x7, x3 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + and x10, x10, #0x7fffffffffffffff __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. 
#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; 
\ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + umull x7, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x8, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x7, x7, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x8, x8, x15 __LF \ + adds x7, x7, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x8, x8, x16 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + umull x11, w3, w5 __LF \ + lsr x0, x3, #32 __LF \ + umull x15, w0, w5 __LF \ + lsr x16, x5, #32 __LF \ + umull x12, w16, w0 __LF \ + umull x16, w3, w16 __LF \ + adds x11, x11, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x12, x12, x15 __LF \ + adds x11, x11, x16, lsl #32 __LF \ + lsr x16, x16, #32 __LF \ + adc x12, x12, x16 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x16, cc __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, cc __LF \ + cinv x16, x16, cc __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, cc __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, cc __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, cc __LF \ + csetm x9, cc __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, cc __LF \ + cinv x9, x9, cc __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #0x1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #0x1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + 
eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x3, #0x26 __LF \ + umull x4, w11, w3 __LF \ + add x4, x4, w7, uxtw __LF \ + lsr x7, x7, #32 __LF \ + lsr x11, x11, #32 __LF \ + umaddl x11, w11, w3, x7 __LF \ + mov x7, x4 __LF \ + umull x4, w12, w3 __LF \ + add x4, x4, w8, uxtw __LF \ + lsr x8, x8, #32 __LF \ + lsr x12, x12, #32 __LF \ + umaddl x12, w12, w3, x8 __LF \ + mov x8, x4 __LF \ + umull x4, w13, w3 __LF \ + add x4, x4, w9, uxtw __LF \ + lsr x9, x9, #32 __LF \ + lsr x13, x13, #32 __LF \ + umaddl x13, w13, w3, x9 __LF \ + mov x9, x4 __LF \ + umull x4, w14, w3 __LF \ + add x4, x4, w10, uxtw __LF \ + lsr x10, x10, #32 __LF \ + lsr x14, x14, #32 __LF \ + umaddl x14, w14, w3, x10 __LF \ + mov x10, x4 __LF \ + lsr x0, x14, #31 __LF \ + mov x5, #0x13 __LF \ + umull x5, w5, w0 __LF \ + add x7, x7, x5 __LF \ + adds x7, x7, x11, lsl #32 __LF \ + extr x3, x12, x11, #32 __LF \ + adcs x8, x8, x3 __LF \ + extr x3, x13, x12, #32 __LF \ + adcs x9, x9, x3 __LF \ + extr x3, x14, x13, #32 __LF \ + lsl x5, x0, #63 __LF \ + eor x10, x10, x5 __LF \ + adc x10, x10, x3 __LF \ + stp x7, x8, [P0] __LF \ stp x9, x10, [P0+16] // Squaring just giving a result < 2 * p_25519, which is done by @@ -471,155 +471,155 @@ // optional correction. #define sqr_4(P0,P1) \ - ldp x10, x11, [P1]; \ - ldp x12, x13, [P1+16]; \ - umull x2, w10, w10; \ - lsr x14, x10, #32; \ - umull x3, w14, w14; \ - umull x14, w10, w14; \ - adds x2, x2, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x3, x3, x14; \ - umull x4, w11, w11; \ - lsr x14, x11, #32; \ - umull x5, w14, w14; \ - umull x14, w11, w14; \ - mul x15, x10, x11; \ - umulh x16, x10, x11; \ - adds x4, x4, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x5, x5, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x5, x5, xzr; \ - adds x3, x3, x15; \ - adcs x4, x4, x16; \ - adc x5, x5, xzr; \ - umull x6, w12, w12; \ - lsr x14, x12, #32; \ - umull x7, w14, w14; \ - umull x14, w12, w14; \ - adds x6, x6, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x7, x7, x14; \ - umull x8, w13, w13; \ - lsr x14, x13, #32; \ - umull x9, w14, w14; \ - umull x14, w13, w14; \ - mul x15, x12, x13; \ - umulh x16, x12, x13; \ - adds x8, x8, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x9, x9, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x9, x9, xzr; \ - adds x7, x7, x15; \ - adcs x8, x8, x16; \ - adc x9, x9, xzr; \ - subs x10, x10, x12; \ - sbcs x11, x11, x13; \ - csetm x16, cc; \ - eor x10, x10, x16; \ - subs x10, x10, x16; \ - eor x11, x11, x16; \ - sbc x11, x11, x16; \ - adds x6, x6, x4; \ - adcs x7, x7, x5; \ - adcs x8, x8, xzr; \ - adc x9, x9, xzr; \ - umull x12, w10, w10; \ - lsr x5, x10, #32; \ - umull x13, w5, w5; \ - umull x5, w10, w5; \ - adds x12, x12, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x13, x13, x5; \ - umull x15, w11, w11; \ - lsr x5, x11, #32; \ - umull x14, w5, w5; \ - umull x5, w11, w5; \ - mul x4, x10, x11; \ - umulh x16, x10, x11; \ - adds x15, x15, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x14, x14, x5; \ - adds x4, x4, x4; \ - adcs x16, x16, x16; \ - adc x14, x14, xzr; \ - adds x13, x13, x4; \ - adcs x15, x15, x16; \ - adc x14, x14, xzr; \ - adds x4, x2, x6; \ - adcs x5, x3, x7; \ - adcs x6, x6, x8; \ - adcs x7, x7, x9; \ - csetm x16, cc; \ - subs x4, x4, x12; \ - sbcs x5, x5, x13; \ - sbcs x6, x6, x15; \ - sbcs x7, x7, x14; \ - adcs x8, x8, x16; \ - adc x9, x9, x16; \ - mov x10, #0x26; \ - umull 
x12, w6, w10; \ - add x12, x12, w2, uxtw; \ - lsr x2, x2, #32; \ - lsr x6, x6, #32; \ - umaddl x6, w6, w10, x2; \ - mov x2, x12; \ - umull x12, w7, w10; \ - add x12, x12, w3, uxtw; \ - lsr x3, x3, #32; \ - lsr x7, x7, #32; \ - umaddl x7, w7, w10, x3; \ - mov x3, x12; \ - umull x12, w8, w10; \ - add x12, x12, w4, uxtw; \ - lsr x4, x4, #32; \ - lsr x8, x8, #32; \ - umaddl x8, w8, w10, x4; \ - mov x4, x12; \ - umull x12, w9, w10; \ - add x12, x12, w5, uxtw; \ - lsr x5, x5, #32; \ - lsr x9, x9, #32; \ - umaddl x9, w9, w10, x5; \ - mov x5, x12; \ - lsr x13, x9, #31; \ - mov x11, #0x13; \ - umull x11, w11, w13; \ - add x2, x2, x11; \ - adds x2, x2, x6, lsl #32; \ - extr x10, x7, x6, #32; \ - adcs x3, x3, x10; \ - extr x10, x8, x7, #32; \ - adcs x4, x4, x10; \ - extr x10, x9, x8, #32; \ - lsl x11, x13, #63; \ - eor x5, x5, x11; \ - adc x5, x5, x10; \ - stp x2, x3, [P0]; \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, cc __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, cc __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x10, #0x26 __LF \ + umull x12, w6, w10 __LF \ + add x12, x12, w2, uxtw __LF \ + lsr x2, x2, #32 __LF \ + lsr x6, x6, #32 __LF \ + umaddl x6, w6, w10, x2 __LF \ + mov x2, x12 __LF \ + umull x12, w7, w10 __LF \ + 
add x12, x12, w3, uxtw __LF \ + lsr x3, x3, #32 __LF \ + lsr x7, x7, #32 __LF \ + umaddl x7, w7, w10, x3 __LF \ + mov x3, x12 __LF \ + umull x12, w8, w10 __LF \ + add x12, x12, w4, uxtw __LF \ + lsr x4, x4, #32 __LF \ + lsr x8, x8, #32 __LF \ + umaddl x8, w8, w10, x4 __LF \ + mov x4, x12 __LF \ + umull x12, w9, w10 __LF \ + add x12, x12, w5, uxtw __LF \ + lsr x5, x5, #32 __LF \ + lsr x9, x9, #32 __LF \ + umaddl x9, w9, w10, x5 __LF \ + mov x5, x12 __LF \ + lsr x13, x9, #31 __LF \ + mov x11, #0x13 __LF \ + umull x11, w11, w13 __LF \ + add x2, x2, x11 __LF \ + adds x2, x2, x6, lsl #32 __LF \ + extr x10, x7, x6, #32 __LF \ + adcs x3, x3, x10 __LF \ + extr x10, x8, x7, #32 __LF \ + adcs x4, x4, x10 __LF \ + extr x10, x9, x8, #32 __LF \ + lsl x11, x13, #63 __LF \ + eor x5, x5, x11 __LF \ + adc x5, x5, x10 __LF \ + stp x2, x3, [P0] __LF \ stp x4, x5, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. @@ -629,59 +629,59 @@ // at least one of them is reduced double modulo. 
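(Illustrative aside, not part of the patch.) Both sub_twice4 above and the add_twice4/double_twice4 macros that follow lean on the same fact: values are only kept reduced modulo the double modulus 2 * p_25519 = 2^256 - 38, so a borrow out of a 256-bit subtraction can be absorbed by subtracting 38 and a carry out of an addition by adding 38. A minimal C model of the subtraction case, assuming both inputs are already < 2 * p_25519; the helper name is invented and flags are spelled out with __int128 instead of the subs/sbcs chain:

#include <stdint.h>

/* z = (x - y) mod (2^256 - 38), little-endian 64-bit limbs, inputs < 2*p. */
static void sub_twice4_model(uint64_t z[4], const uint64_t x[4],
                             const uint64_t y[4]) {
  unsigned __int128 t;
  uint64_t borrow = 0;
  for (int i = 0; i < 4; i++) {              /* the subs/sbcs chain */
    t = (unsigned __int128)x[i] - y[i] - borrow;
    z[i] = (uint64_t)t;
    borrow = (uint64_t)(t >> 64) & 1;
  }
  /* On borrow the 4 limbs hold x - y + 2^256, and 2^256 == 38 (mod 2*p),
     so subtract 38 once (the csel x3, x4, xzr, lo step in the macro).
     With inputs < 2*p this second subtraction cannot borrow again. */
  t = (unsigned __int128)z[0] - (borrow ? 38 : 0);
  z[0] = (uint64_t)t;
  borrow = (uint64_t)(t >> 64) & 1;
  for (int i = 1; i < 4; i++) {
    t = (unsigned __int128)z[i] - borrow;
    z[i] = (uint64_t)t;
    borrow = (uint64_t)(t >> 64) & 1;
  }
}

add_twice4 and double_twice4, defined next, are the mirror image: a carry out of the 256-bit addition is folded back in by conditionally adding 38.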
#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] // Load the constant k_25519 = 2 * d_25519 using immediate operations #define load_k25519(P0) \ - movz x0, #0xf159; \ - movz x1, #0xb156; \ - movz x2, #0xd130; \ - movz x3, #0xfce7; \ - movk x0, #0x26b2, lsl #16; \ - movk x1, #0x8283, lsl #16; \ - movk x2, #0xeef3, lsl #16; \ - movk x3, #0x56df, lsl #16; \ - movk x0, #0x9b94, lsl #32; \ - movk x1, #0x149a, lsl #32; \ - movk x2, #0x80f2, lsl #32; \ - movk x3, #0xd9dc, lsl #32; \ - movk x0, #0xebd6, lsl #48; \ - movk x1, #0x00e0, lsl #48; \ - movk x2, #0x198e, lsl #48; \ - movk x3, #0x2406, lsl #48; \ - stp x0, x1, [P0]; \ + movz x0, #0xf159 __LF \ + movz x1, #0xb156 __LF \ + movz x2, #0xd130 __LF \ + movz x3, #0xfce7 __LF \ + movk x0, #0x26b2, lsl #16 __LF \ + movk x1, #0x8283, lsl #16 __LF \ + movk x2, #0xeef3, lsl #16 __LF \ + movk x3, #0x56df, lsl #16 __LF \ + movk x0, #0x9b94, lsl #32 __LF \ + movk x1, #0x149a, lsl #32 __LF \ + movk x2, #0x80f2, lsl #32 __LF \ + movk x3, #0xd9dc, lsl #32 __LF \ + movk x0, #0xebd6, lsl #48 __LF \ + movk x1, #0x00e0, lsl #48 __LF \ + movk x2, #0x198e, lsl #48 __LF \ + movk x3, #0x2406, lsl #48 __LF \ + stp x0, x1, [P0] __LF \ stp x2, x3, [P0+16] S2N_BN_SYMBOL(edwards25519_scalarmuldouble): diff --git a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble_alt.S similarity index 81% rename from third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble_alt.S index 9c3d6db2cb5..6df13a937bd 100644 --- a/third_party/s2n-bignum/arm/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/curve25519/edwards25519_scalarmuldouble_alt.S @@ -99,213 +99,213 @@ // Load 64-bit immediate into a register #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 // Macro wrapping up the basic field operation 
bignum_mul_p25519_alt, only // trivially different from a pure function call to that subroutine. #define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - orr x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - madd x11, x7, x8, x7; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adcs x15, x15, xzr; \ - csel x7, x7, xzr, cc; \ - subs x12, x12, x7; \ - sbcs x13, x13, xzr; \ - sbcs x14, x14, xzr; \ - sbc x15, x15, xzr; \ - and x15, x15, #0x7fffffffffffffff; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + 
umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + orr x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + madd x11, x7, x8, x7 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x15, x15, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + and x15, x15, #0x7fffffffffffffff __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // A version of multiplication that only guarantees output < 2 * p_25519. // This basically skips the +1 and final correction in quotient estimation. #define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - mul x12, x3, x7; \ - umulh x13, x3, x7; \ - mul x11, x3, x8; \ - umulh x14, x3, x8; \ - adds x13, x13, x11; \ - ldp x9, x10, [P2+16]; \ - mul x11, x3, x9; \ - umulh x15, x3, x9; \ - adcs x14, x14, x11; \ - mul x11, x3, x10; \ - umulh x16, x3, x10; \ - adcs x15, x15, x11; \ - adc x16, x16, xzr; \ - ldp x5, x6, [P1+16]; \ - mul x11, x4, x7; \ - adds x13, x13, x11; \ - mul x11, x4, x8; \ - adcs x14, x14, x11; \ - mul x11, x4, x9; \ - adcs x15, x15, x11; \ - mul x11, x4, x10; \ - adcs x16, x16, x11; \ - umulh x3, x4, x10; \ - adc x3, x3, xzr; \ - umulh x11, x4, x7; \ - adds x14, x14, x11; \ - umulh x11, x4, x8; \ - adcs x15, x15, x11; \ - umulh x11, x4, x9; \ - adcs x16, x16, x11; \ - adc x3, x3, xzr; \ - mul x11, x5, x7; \ - adds x14, x14, x11; \ - mul x11, x5, x8; \ - adcs x15, x15, x11; \ - mul x11, x5, x9; \ - adcs x16, x16, x11; \ - mul x11, x5, x10; \ - adcs x3, x3, x11; \ - umulh x4, x5, x10; \ - adc x4, x4, xzr; \ - umulh x11, x5, x7; \ - adds x15, x15, x11; \ - umulh x11, x5, x8; \ - adcs x16, x16, x11; \ - umulh x11, x5, x9; \ - adcs x3, x3, x11; \ - adc x4, x4, xzr; \ - mul x11, x6, x7; \ - adds x15, x15, x11; \ - mul x11, x6, x8; \ - adcs x16, x16, x11; \ - mul x11, x6, x9; \ - adcs x3, x3, x11; \ - mul x11, x6, x10; \ - adcs x4, x4, x11; \ - umulh x5, x6, x10; \ - adc x5, x5, xzr; \ - umulh x11, x6, x7; \ - adds x16, x16, x11; \ - umulh x11, x6, x8; \ - adcs x3, x3, x11; \ - umulh x11, x6, x9; \ - adcs x4, x4, x11; \ - adc x5, x5, xzr; \ - mov x7, #0x26; \ - mul x11, x7, x16; \ - umulh x9, x7, x16; \ - adds x12, x12, x11; \ - mul x11, x7, x3; \ - umulh x3, x7, x3; \ - adcs x13, x13, x11; \ - mul x11, x7, x4; \ - umulh x4, x7, x4; \ - adcs x14, x14, x11; \ - mul x11, x7, x5; \ - umulh x5, x7, x5; \ - adcs x15, x15, x11; \ - cset x16, cs; \ - adds x15, x15, 
x4; \ - adc x16, x16, x5; \ - cmn x15, x15; \ - bic x15, x15, #0x8000000000000000; \ - adc x8, x16, x16; \ - mov x7, #0x13; \ - mul x11, x7, x8; \ - adds x12, x12, x11; \ - adcs x13, x13, x9; \ - adcs x14, x14, x3; \ - adc x15, x15, xzr; \ - stp x12, x13, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x15, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x16, x3, x10 __LF \ + adcs x15, x15, x11 __LF \ + adc x16, x16, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x16, x16, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x16, x16, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x26 __LF \ + mul x11, x7, x16 __LF \ + umulh x9, x7, x16 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x15, x15, x11 __LF \ + cset x16, cs __LF \ + adds x15, x15, x4 __LF \ + adc x16, x16, x5 __LF \ + cmn x15, x15 __LF \ + bic x15, x15, #0x8000000000000000 __LF \ + adc x8, x16, x16 __LF \ + mov x7, #0x13 __LF \ + mul x11, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adc x15, x15, xzr __LF \ + stp x12, x13, [P0] __LF \ stp x14, x15, [P0+16] // Squaring just giving a result < 2 * p_25519, which is done by @@ -313,97 +313,97 @@ // optional correction. 
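(Illustrative aside, not part of the patch.) The constants #0x26 (= 38) and #0x13 (= 19) that appear in mul_p25519, mul_4 and sqr_4 all come from p_25519 = 2^255 - 19, which gives 2^255 == 19 and 2^256 == 38 (mod p_25519): anything above bit 255 can be folded back in after multiplying by 19 or 38. A rough C model of the folding phase only, with an invented name and no attempt to reproduce the assembly's carry scheduling or its bit-255 quotient estimate:

#include <stdint.h>

/* Fold an 8-limb (512-bit) product z into 4 limbs congruent mod p_25519,
   using 2^256 == 38 repeatedly.  Little-endian 64-bit limbs. */
static void fold_by_38_model(uint64_t r[4], const uint64_t z[8]) {
  uint64_t t[5];
  unsigned __int128 acc = 0;
  for (int i = 0; i < 4; i++) {              /* t = z_low + 38 * z_high */
    acc += (unsigned __int128)z[i] + (unsigned __int128)38 * z[i + 4];
    t[i] = (uint64_t)acc;
    acc >>= 64;
  }
  t[4] = (uint64_t)acc;                      /* small overflow limb, at most 38 */
  while (t[4] != 0) {                        /* fold the overflow limb as well */
    acc = (unsigned __int128)38 * t[4];
    t[4] = 0;
    for (int i = 0; i < 4; i++) {
      acc += t[i];
      t[i] = (uint64_t)acc;
      acc >>= 64;
    }
    t[4] = (uint64_t)acc;                    /* shrinks to 0 within a round or two */
  }
  for (int i = 0; i < 4; i++) r[i] = t[i];
  /* The result fits in 256 bits and is congruent to z mod p_25519.  The macros
     above go further: mul_4/sqr_4 guarantee < 2*p_25519, and mul_p25519 adds
     the +1 quotient estimate with #0x13 plus a final conditional subtraction
     to land strictly below p_25519. */
}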
#define sqr_4(P0,P1) \ - ldp x2, x3, [P1]; \ - mul x9, x2, x3; \ - umulh x10, x2, x3; \ - ldp x4, x5, [P1+16]; \ - mul x11, x2, x5; \ - umulh x12, x2, x5; \ - mul x7, x2, x4; \ - umulh x6, x2, x4; \ - adds x10, x10, x7; \ - adcs x11, x11, x6; \ - mul x7, x3, x4; \ - umulh x6, x3, x4; \ - adc x6, x6, xzr; \ - adds x11, x11, x7; \ - mul x13, x4, x5; \ - umulh x14, x4, x5; \ - adcs x12, x12, x6; \ - mul x7, x3, x5; \ - umulh x6, x3, x5; \ - adc x6, x6, xzr; \ - adds x12, x12, x7; \ - adcs x13, x13, x6; \ - adc x14, x14, xzr; \ - adds x9, x9, x9; \ - adcs x10, x10, x10; \ - adcs x11, x11, x11; \ - adcs x12, x12, x12; \ - adcs x13, x13, x13; \ - adcs x14, x14, x14; \ - cset x6, cs; \ - umulh x7, x2, x2; \ - mul x8, x2, x2; \ - adds x9, x9, x7; \ - mul x7, x3, x3; \ - adcs x10, x10, x7; \ - umulh x7, x3, x3; \ - adcs x11, x11, x7; \ - mul x7, x4, x4; \ - adcs x12, x12, x7; \ - umulh x7, x4, x4; \ - adcs x13, x13, x7; \ - mul x7, x5, x5; \ - adcs x14, x14, x7; \ - umulh x7, x5, x5; \ - adc x6, x6, x7; \ - mov x3, #0x26; \ - mul x7, x3, x12; \ - umulh x4, x3, x12; \ - adds x8, x8, x7; \ - mul x7, x3, x13; \ - umulh x13, x3, x13; \ - adcs x9, x9, x7; \ - mul x7, x3, x14; \ - umulh x14, x3, x14; \ - adcs x10, x10, x7; \ - mul x7, x3, x6; \ - umulh x6, x3, x6; \ - adcs x11, x11, x7; \ - cset x12, cs; \ - adds x11, x11, x14; \ - adc x12, x12, x6; \ - cmn x11, x11; \ - bic x11, x11, #0x8000000000000000; \ - adc x2, x12, x12; \ - mov x3, #0x13; \ - mul x7, x3, x2; \ - adds x8, x8, x7; \ - adcs x9, x9, x4; \ - adcs x10, x10, x13; \ - adc x11, x11, xzr; \ - stp x8, x9, [P0]; \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x26 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + cmn x11, x11 __LF \ + bic x11, x11, #0x8000000000000000 __LF \ + adc x2, x12, x12 __LF \ + mov x3, #0x13 __LF \ + mul x7, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ stp x10, x11, [P0+16] // Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 #define 
sub_twice4(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [P0]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #38 __LF \ + csel x3, x4, xzr, lo __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ stp x7, x8, [P0+16] // Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. @@ -413,59 +413,59 @@ // at least one of them is reduced double modulo. #define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x8 __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2+16] __LF \ + adcs x5, x5, x7 __LF \ + adcs x6, x6, x8 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] #define double_twice4(P0,P1) \ - ldp x3, x4, [P1]; \ - adds x3, x3, x3; \ - adcs x4, x4, x4; \ - ldp x5, x6, [P1+16]; \ - adcs x5, x5, x5; \ - adcs x6, x6, x6; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ + ldp x3, x4, [P1] __LF \ + adds x3, x3, x3 __LF \ + adcs x4, x4, x4 __LF \ + ldp x5, x6, [P1+16] __LF \ + adcs x5, x5, x5 __LF \ + adcs x6, x6, x6 __LF \ + mov x9, #38 __LF \ + csel x9, x9, xzr, cs __LF \ + adds x3, x3, x9 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adc x6, x6, xzr __LF \ + stp x3, x4, [P0] __LF \ stp x5, x6, [P0+16] // Load the constant k_25519 = 2 * d_25519 using immediate operations #define load_k25519(P0) \ - movz x0, #0xf159; \ - movz x1, #0xb156; \ - movz x2, #0xd130; \ - movz x3, #0xfce7; \ - movk x0, #0x26b2, lsl #16; \ - movk x1, #0x8283, lsl #16; \ - movk x2, #0xeef3, lsl #16; \ - movk x3, #0x56df, lsl #16; \ - movk x0, #0x9b94, lsl #32; \ - movk x1, #0x149a, lsl #32; \ - movk x2, #0x80f2, lsl #32; \ - movk x3, #0xd9dc, lsl #32; \ - movk x0, #0xebd6, lsl #48; \ - movk x1, #0x00e0, lsl #48; \ - movk x2, #0x198e, lsl #48; \ - movk x3, #0x2406, lsl #48; \ - stp x0, x1, [P0]; \ + movz x0, #0xf159 __LF \ + movz x1, #0xb156 __LF \ + movz x2, #0xd130 __LF \ + movz x3, #0xfce7 __LF \ + movk x0, #0x26b2, lsl #16 __LF \ + movk x1, #0x8283, lsl #16 __LF \ + movk x2, #0xeef3, lsl #16 __LF \ + movk x3, #0x56df, lsl #16 __LF \ + movk x0, #0x9b94, lsl #32 __LF \ + movk x1, #0x149a, lsl #32 __LF \ + movk x2, #0x80f2, lsl #32 __LF \ + movk x3, #0xd9dc, lsl #32 __LF \ + movk x0, #0xebd6, lsl #48 __LF \ + movk x1, #0x00e0, lsl #48 __LF \ + movk x2, #0x198e, lsl #48 __LF \ + movk x3, #0x2406, lsl #48 __LF \ + stp x0, x1, [P0] __LF \ stp x2, x3, [P0+16] S2N_BN_SYMBOL(edwards25519_scalarmuldouble_alt): diff 
--git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/Makefile new file mode 100644 index 00000000000..10f922e94dc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/Makefile @@ -0,0 +1,47 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_emontredc_8n.o \ + bignum_emontredc_8n_cdiff.o \ + bignum_kmul_16_32.o \ + bignum_kmul_32_64.o \ + bignum_ksqr_16_32.o \ + bignum_ksqr_32_64.o \ + bignum_mul_4_8.o \ + bignum_mul_4_8_alt.o \ + bignum_mul_6_12.o \ + bignum_mul_6_12_alt.o \ + bignum_mul_8_16.o \ + bignum_mul_8_16_alt.o \ + bignum_sqr_4_8.o \ + bignum_sqr_4_8_alt.o \ + bignum_sqr_6_12.o \ + bignum_sqr_6_12_alt.o \ + bignum_sqr_8_16.o \ + bignum_sqr_8_16_alt.o \ + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n.S similarity index 72% rename from third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n.S index 3e72ebd67fa..19dc363f13c 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n.S @@ -5,23 +5,25 @@ // Extended Montgomery reduce in 8-digit blocks, results in input-output buffer // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] // -// extern uint64_t bignum_emontredc_8n_neon +// extern uint64_t bignum_emontredc_8n // (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); // // Functionally equivalent to bignum_emontredc (see that file for more detail). // But in general assumes that the input k is a multiple of 8. +// bignum_emontredc_8n is a vectorized version of +// unopt/bignum_emontredc_8n_base. // // Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = w, returns X0 // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n_neon) - .text - .balign 4 + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n) + .text + .balign 4 -S2N_BN_SYMBOL(bignum_emontredc_8n_neon): +S2N_BN_SYMBOL(bignum_emontredc_8n): stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! 
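(Illustrative aside, not part of the patch.) For reference, the word-level Montgomery reduction that bignum_emontredc_8n (and the _cdiff variant added below) vectorizes and unrolls can be modelled in C roughly as follows. z has 2*k words, m has k words, and w plays the role of the usual Montgomery constant -m^{-1} mod 2^64; each step picks a quotient digit q that clears the next low word of z, and the stp x4, x5, [x1] / stp x6, x7, [x1, #16] stores visible above suggest the quotient digits are written back over those cleared low words. The function name and bookkeeping are a sketch, not the routine's exact contract:

#include <stdint.h>

static uint64_t emontredc_model(uint64_t k, uint64_t *z,
                                const uint64_t *m, uint64_t w) {
  uint64_t topcarry = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t q = w * z[i];                   /* quotient digit, mod 2^64 */
    unsigned __int128 c = 0;
    for (uint64_t j = 0; j < k; j++) {       /* z += q * m, shifted by i words */
      c += (unsigned __int128)q * m[j] + z[i + j];
      z[i + j] = (uint64_t)c;
      c >>= 64;
    }
    for (uint64_t j = i + k; c != 0 && j < 2 * k; j++) {   /* propagate carry */
      c += z[j];
      z[j] = (uint64_t)c;
      c >>= 64;
    }
    topcarry |= (uint64_t)c;                 /* at most one extra top bit */
    z[i] = q;                                /* quotient stored in the low half */
  }
  return topcarry;
}

The assembly processes four quotient digits per outer iteration and interleaves the scalar multiply/accumulate chains with NEON 32x32 multiplies, which is why k is assumed to be a multiple of 8.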
@@ -31,14 +33,14 @@ S2N_BN_SYMBOL(bignum_emontredc_8n_neon): lsr x0, x0, #2 mov x26, x0 subs x12, x0, #1 - bcc bignum_emontredc_8n_neon_end + bcc bignum_emontredc_8n_end stp x3, xzr, [sp] stp x26, xzr, [sp, #16] mov x28, xzr lsl x0, x12, #5 -bignum_emontredc_8n_neon_outerloop: +bignum_emontredc_8n_outerloop: ldp x3, xzr, [sp] ldp x17, x19, [x1] ldp x20, x21, [x1, #16] @@ -54,10 +56,10 @@ dup v0.2d, x4 uzp2 v3.4s, v21.4s, v0.4s xtn v4.2s, v0.2d xtn v5.2s, v21.2d - mul x12, x4, x8 + mul x12, x4, x8 adds x17, x17, x12 umulh x12, x4, x8 - mul x13, x4, x9 + mul x13, x4, x9 rev64 v1.4s, v21.4s umull v6.2d, v4.2s, v5.2s umull v7.2d, v4.2s, v3.2s @@ -96,10 +98,10 @@ uzp2 v3.4s, v21.4s, v0.4s xtn v4.2s, v0.2d xtn v5.2s, v21.2d - mul x12, x5, x8 - adds x19, x19, x12 - umulh x12, x5, x8 - mul x13, x5, x9 + mul x12, x5, x8 + adds x19, x19, x12 + umulh x12, x5, x8 + mul x13, x5, x9 rev64 v1.4s, v21.4s umull v6.2d, v4.2s, v5.2s @@ -117,8 +119,8 @@ usra v1.2d, v7.2d, #32 umlal v0.2d, v4.2s, v5.2s mov x14, v0.d[0] mov x15, v0.d[1] - adcs x20, x20, x13 - umulh x13, x5, x9 + adcs x20, x20, x13 + umulh x13, x5, x9 adcs x21, x21, x14 usra v1.2d, v2.2d, #32 mov x14, v1.d[0] @@ -126,7 +128,7 @@ mov x14, v1.d[0] mov x15, v1.d[1] adc x23, xzr, xzr adds x20, x20, x12 - mul x6, x20, x3 // hoisted from step 2 + mul x6, x20, x3 // hoisted from step 2 // NEON: For montgomery step 2, // calculate x6 * (x10, x11) that does two 64x64->128-bit multiplications. @@ -145,7 +147,7 @@ xtn v5.2s, in2.2d stp x4, x5, [x1] -// hoisted from maddloop_neon_firstitr +// hoisted from maddloop_firstitr ldr q20, [x1] // q21 will be loaded later. @@ -159,24 +161,24 @@ umull v6.2d, v4.2s, v5.2s umull v7.2d, v4.2s, v3.2s uzp2 v16.4s, in1.4s, in1.4s - mul x12, x6, x8 - adds x20, x20, x12 + mul x12, x6, x8 + adds x20, x20, x12 mul v0.4s, v1.4s, in1.4s movi v2.2d, #0x000000ffffffff usra v7.2d, v6.2d, #32 umull out_hi.2d, v16.2s, v3.2s - umulh x12, x6, x8 - mul x13, x6, x9 + umulh x12, x6, x8 + mul x13, x6, x9 uaddlp v0.2d, v0.4s and v2.16b, v7.16b, v2.16b umlal v2.2d, v16.2s, v5.2s shl out_lo.2d, v0.2d, #32 - adcs x21, x21, x13 - umulh x13, x6, x9 + adcs x21, x21, x13 + umulh x13, x6, x9 usra out_hi.2d, v7.2d, #32 umlal out_lo.2d, v4.2s, v5.2s @@ -195,19 +197,19 @@ usra out_hi.2d, v2.2d, #32 mov x14, v1.d[0] mov x15, v1.d[1] - adc x24, xzr, xzr - adds x21, x21, x12 - mul x7, x21, x3 - adcs x22, x22, x13 - adcs x23, x23, x14 - adc x24, x24, x15 + adc x24, xzr, xzr + adds x21, x21, x12 + mul x7, x21, x3 + adcs x22, x22, x13 + adcs x23, x23, x14 + adc x24, x24, x15 stp x6, x7, [x1, #16] -// hoisted from maddloop_neon_firstitr +// hoisted from maddloop_firstitr ldr q21, [x1, #16] -// pre-calculate 2mul+2umulhs in maddloop_neon_firstitr +// pre-calculate 2mul+2umulhs in maddloop_firstitr // v25++v24 = hi and lo of (x4 * x8, x5 * x9) #define in1 v20 #define in2 v22 @@ -218,16 +220,16 @@ xtn v4.2s, in1.2d // Montgomery step 3 - mul x12, x7, x8 - mul x13, x7, x9 + mul x12, x7, x8 + mul x13, x7, x9 xtn v5.2s, in2.2d rev64 v1.4s, in2.4s umull v6.2d, v4.2s, v5.2s umull v7.2d, v4.2s, v3.2s - mul x14, x7, x10 - mul x15, x7, x11 + mul x14, x7, x10 + mul x15, x7, x11 uzp2 v16.4s, in1.4s, in1.4s mul v0.4s, v1.4s, in1.4s @@ -238,10 +240,10 @@ uaddlp v0.2d, v0.4s and v2.16b, v7.16b, v2.16b umlal v2.2d, v16.2s, v5.2s - adds x21, x21, x12 - umulh x12, x7, x8 - adcs x22, x22, x13 - umulh x13, x7, x9 + adds x21, x21, x12 + umulh x12, x7, x8 + adcs x22, x22, x13 + umulh x13, x7, x9 shl out_lo.2d, v0.2d, #32 usra out_hi.2d, v7.2d, #32 @@ -252,10 +254,10 @@ usra 
out_hi.2d, v2.2d, #32 #undef out_lo #undef out_hi - adcs x23, x23, x14 - umulh x14, x7, x10 - adcs x24, x24, x15 - umulh x15, x7, x11 + adcs x23, x23, x14 + umulh x14, x7, x10 + adcs x24, x24, x15 + umulh x15, x7, x11 // v27++v26 = hi and lo of (x6 * x10, x7 * x11) #define in1 v21 @@ -267,7 +269,7 @@ xtn v4.2s, in1.2d xtn v5.2s, in2.2d rev64 v1.4s, in2.4s -// hoisted from maddloop_neon_firstitr and maddloop_x0one +// hoisted from maddloop_firstitr and maddloop_x0one ldp x8, x9, [x2, #32] ldp x10, x11, [x2, #48] @@ -276,11 +278,11 @@ umull v7.2d, v4.2s, v3.2s uzp2 v16.4s, in1.4s, in1.4s mul v0.4s, v1.4s, in1.4s - adc x25, xzr, xzr - adds x12, x22, x12 - adcs x13, x23, x13 - adcs x14, x24, x14 - adc x15, x25, x15 + adc x25, xzr, xzr + adds x12, x22, x12 + adcs x13, x23, x13 + adcs x14, x24, x14 + adc x15, x25, x15 movi v2.2d, #0x000000ffffffff usra v7.2d, v6.2d, #32 @@ -297,156 +299,156 @@ usra out_hi.2d, v2.2d, #32 #undef out_lo #undef out_hi - cbz x0, bignum_emontredc_8n_neon_madddone + cbz x0, bignum_emontredc_8n_madddone mov x27, x0 cmp x0, #32 - bne bignum_emontredc_8n_neon_maddloop_neon_firstitr - -bignum_emontredc_8n_neon_maddloop_x0one: - add x2, x2, #0x20 - add x1, x1, #0x20 - mul x17, x4, x8 - mul x22, x5, x9 - mul x23, x6, x10 - mul x24, x7, x11 - umulh x16, x4, x8 - adds x22, x22, x16 - umulh x16, x5, x9 - adcs x23, x23, x16 - umulh x16, x6, x10 - adcs x24, x24, x16 - umulh x16, x7, x11 - adc x25, x16, xzr - ldp x20, x21, [x1] - adds x12, x12, x20 - adcs x13, x13, x21 - ldp x20, x21, [x1, #16] - adcs x14, x14, x20 - adcs x15, x15, x21 - adc x16, xzr, xzr - adds x19, x22, x17 - adcs x22, x23, x22 - adcs x23, x24, x23 - adcs x24, x25, x24 - adc x25, xzr, x25 - adds x20, x22, x17 - adcs x21, x23, x19 - adcs x22, x24, x22 - adcs x23, x25, x23 - adcs x24, xzr, x24 - adc x25, xzr, x25 - adds x17, x17, x12 - adcs x19, x19, x13 - adcs x20, x20, x14 - adcs x21, x21, x15 - adcs x22, x22, x16 - adcs x23, x23, xzr - adcs x24, x24, xzr - adc x25, x25, xzr - subs x15, x6, x7 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x11, x10 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x23, x23, x14 - eor x13, x13, x12 - adcs x24, x24, x13 - adc x25, x25, x12 - subs x15, x4, x5 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x9, x8 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x19, x19, x14 - eor x13, x13, x12 - adcs x20, x20, x13 - adcs x21, x21, x12 - adcs x22, x22, x12 - adcs x23, x23, x12 - adcs x24, x24, x12 - adc x25, x25, x12 - subs x15, x5, x7 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x11, x9 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x22, x22, x14 - eor x13, x13, x12 - adcs x23, x23, x13 - adcs x24, x24, x12 - adc x25, x25, x12 - subs x15, x4, x6 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x10, x8 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x20, x20, x14 - eor x13, x13, x12 - adcs x21, x21, x13 - adcs x22, x22, x12 - adcs x23, x23, x12 - 
adcs x24, x24, x12 - adc x25, x25, x12 - subs x15, x4, x7 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x11, x8 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x21, x21, x14 - eor x13, x13, x12 - adcs x22, x22, x13 - adcs x23, x23, x12 - adcs x24, x24, x12 - adc x25, x25, x12 - subs x15, x5, x6 - cneg x15, x15, cc // cc = lo, ul, last - csetm x12, cc // cc = lo, ul, last - subs x13, x10, x9 - cneg x13, x13, cc // cc = lo, ul, last - mul x14, x15, x13 - umulh x13, x15, x13 - cinv x12, x12, cc // cc = lo, ul, last - cmn x12, #0x1 - eor x14, x14, x12 - adcs x21, x21, x14 - eor x13, x13, x12 - adcs x22, x22, x13 - adcs x13, x23, x12 - adcs x14, x24, x12 - adc x15, x25, x12 - mov x12, x22 - stp x17, x19, [x1] - stp x20, x21, [x1, #16] - sub x27, x27, #0x20 - b bignum_emontredc_8n_neon_madddone - - -bignum_emontredc_8n_neon_maddloop_neon_firstitr: + bne bignum_emontredc_8n_maddloop_firstitr + +bignum_emontredc_8n_maddloop_x0one: + add x2, x2, #0x20 + add x1, x1, #0x20 + mul x17, x4, x8 + mul x22, x5, x9 + mul x23, x6, x10 + mul x24, x7, x11 + umulh x16, x4, x8 + adds x22, x22, x16 + umulh x16, x5, x9 + adcs x23, x23, x16 + umulh x16, x6, x10 + adcs x24, x24, x16 + umulh x16, x7, x11 + adc x25, x16, xzr + ldp x20, x21, [x1] + adds x12, x12, x20 + adcs x13, x13, x21 + ldp x20, x21, [x1, #16] + adcs x14, x14, x20 + adcs x15, x15, x21 + adc x16, xzr, xzr + adds x19, x22, x17 + adcs x22, x23, x22 + adcs x23, x24, x23 + adcs x24, x25, x24 + adc x25, xzr, x25 + adds x20, x22, x17 + adcs x21, x23, x19 + adcs x22, x24, x22 + adcs x23, x25, x23 + adcs x24, xzr, x24 + adc x25, xzr, x25 + adds x17, x17, x12 + adcs x19, x19, x13 + adcs x20, x20, x14 + adcs x21, x21, x15 + adcs x22, x22, x16 + adcs x23, x23, xzr + adcs x24, x24, xzr + adc x25, x25, xzr + subs x15, x6, x7 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x11, x10 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x23, x23, x14 + eor x13, x13, x12 + adcs x24, x24, x13 + adc x25, x25, x12 + subs x15, x4, x5 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x9, x8 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x19, x19, x14 + eor x13, x13, x12 + adcs x20, x20, x13 + adcs x21, x21, x12 + adcs x22, x22, x12 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x5, x7 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x11, x9 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x22, x22, x14 + eor x13, x13, x12 + adcs x23, x23, x13 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x4, x6 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x10, x8 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x20, x20, x14 + eor x13, x13, x12 + adcs x21, x21, x13 + adcs x22, x22, x12 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x4, x7 + 
cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x11, x8 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x21, x21, x14 + eor x13, x13, x12 + adcs x22, x22, x13 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + subs x15, x5, x6 + cneg x15, x15, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x13, x10, x9 + cneg x13, x13, cc // cc = lo, ul, last + mul x14, x15, x13 + umulh x13, x15, x13 + cinv x12, x12, cc // cc = lo, ul, last + cmn x12, #0x1 + eor x14, x14, x12 + adcs x21, x21, x14 + eor x13, x13, x12 + adcs x22, x22, x13 + adcs x13, x23, x12 + adcs x14, x24, x12 + adc x15, x25, x12 + mov x12, x22 + stp x17, x19, [x1] + stp x20, x21, [x1, #16] + sub x27, x27, #0x20 + b bignum_emontredc_8n_madddone + + +bignum_emontredc_8n_maddloop_firstitr: mov x16, v25.d[0] //umulh x16,x4,x8 mov x22, v24.d[1] //mul x22, x5, x9 @@ -685,10 +687,10 @@ mov x24, v26.d[1] // lo bits of (x7 * x11) sub x27, x27, #32 cmp x27, #32 - beq bignum_emontredc_8n_neon_maddloop_neon_last + beq bignum_emontredc_8n_maddloop_last -bignum_emontredc_8n_neon_maddloop_neon: +bignum_emontredc_8n_maddloop: ldp x8, x9, [x2, #32] ldp x10, x11, [x2, #48] @@ -918,10 +920,10 @@ mov x24, v26.d[1] // lo bits of (x7 * x11) sub x27, x27, #32 cmp x27, #32 - bne bignum_emontredc_8n_neon_maddloop_neon + bne bignum_emontredc_8n_maddloop -bignum_emontredc_8n_neon_maddloop_neon_last: +bignum_emontredc_8n_maddloop_last: ldp x8, x9, [x2, #32] ldp x10, x11, [x2, #48] @@ -1061,7 +1063,7 @@ mov x17, v24.d[0] // lo bits of (x4 * x8) stp x20,x21,[x1,#16] subs x27, x27, #64 -bignum_emontredc_8n_neon_madddone: +bignum_emontredc_8n_madddone: ldp x17, x19, [x1, #32] ldp x20, x21, [x1, #48] ldp x26, xzr, [sp, #16] @@ -1078,10 +1080,10 @@ bignum_emontredc_8n_neon_madddone: add x1, x1, #32 subs x26, x26, #1 stp x26, xzr, [sp, #16] - bne bignum_emontredc_8n_neon_outerloop + bne bignum_emontredc_8n_outerloop neg x0, x28 -bignum_emontredc_8n_neon_end: +bignum_emontredc_8n_end: add sp, sp, #32 ldp x27, x28, [sp], #16 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n_cdiff.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n_cdiff.S new file mode 100644 index 00000000000..daa52ae40d7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_emontredc_8n_cdiff.S @@ -0,0 +1,656 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Extend Montgomery reduce in 8-digit blocks, uses an extra storage to +// temporarily cache multiplied differences appearing in ADK. +// Results are stored in input-output buffer (z). +// Inputs z[2*k], m[k], w; +// Outputs function return (extra result bit) and z[2*k] +// Temporary buffer m_precalc[12*(k/4-1)] +// k must be divisible by 8 and not smaller than 16. 
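(Illustrative aside, not part of the patch.) The cached differences are the absolute differences of word pairs of the quotient block and of the modulus block used by the ADK-style multiplication, stored together with their sign masks so the inner loop does not keep recomputing them. The cdiff macro defined a little further down (subs / cneg / csetm) produces exactly such a pair; a C model with an invented name:

#include <stdint.h>

/* Model of  cdiff(t, c, x, y):  t = |x - y|,  c = all-ones if x < y, else 0.
   The (t, c) pair is what gets stored in slots like cache_m10 / cache_a01. */
static inline void cdiff_model(uint64_t *t, uint64_t *c, uint64_t x, uint64_t y) {
  *c = (x < y) ? ~UINT64_C(0) : 0;   /* csetm c, cc  (borrow mask) */
  *t = (x < y) ? (y - x) : (x - y);  /* subs t, x, y ; cneg t, t, cc */
}

When the inner loop later needs a signed cross product of the form (a_i - a_j)*(m_l - m_k), it multiplies the two cached absolute values and fixes the sign by XOR-ing with the combined mask and re-adding its low bit, the cmn #0x1 / eor / adcs pattern seen throughout the surrounding code.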
+// +// extern uint64_t bignum_emontredc_8n_cdiff +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w, uint64_t *m_precalc); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = w, X4 = m_precalc +// returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n_cdiff) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n_cdiff) + .text + .balign 4 + +#define count x27 + +// Helper macro for the pre-computations +#define cdiff(t, c, x, y) subs t, x, y; cneg t, t, cc; csetm c, cc + +// Some immediate offsets for cached differences+carry used +// in the inner ADK multiplications +#define cache_a01 (32+0*16) +#define cache_a02 (32+1*16) +#define cache_a03 (32+2*16) +#define cache_a12 (32+3*16) +#define cache_a13 (32+4*16) +#define cache_a23 (32+5*16) +#define cache_m10 (0*16) +#define cache_m20 (1*16) +#define cache_m30 (2*16) +#define cache_m21 (3*16) +#define cache_m31 (4*16) +#define cache_m32 (5*16) + +#define a0 x4 +#define a1 x5 +#define a2 x6 +#define a3 x7 + +// Registers for precalculation +#define vpre00 v30 +#define vpre01 v28 +#define vpre02 v17 +#define vpre10 v18 +#define vpre11 v19 +#define vpre12 v20 + +#define m x2 + +S2N_BN_SYMBOL(bignum_emontredc_8n_cdiff): + + sub sp, sp, #(10*16) + stp x19, x20, [sp, #(9*16)] + stp x21, x22, [sp, #(8*16)] + stp x23, x24, [sp, #(7*16)] + stp x25, x26, [sp, #(6*16)] + stp x27, x28, [sp, #(5*16)] + stp x29, x30, [sp, #(4*16)] + stp d14, d15, [sp, #(3*16)] + stp d12, d13, [sp, #(2*16)] + stp d10, d11, [sp, #(1*16)] + stp d8, d9, [sp, #(0*16)] + + // Leave space for cached differences of words of a in inner loop + sub sp, sp, #(6*16) + + sub sp, sp, #32 + lsr x0, x0, #2 + mov x26, x0 + subs x12, x0, #1 + bcc bignum_emontredc_8n_cdiff_end + + // x30 = buffer holding precomputed ADK carry-differences for modulus + + // + // Start of precomputation + // + // Precompute and cache signed differences of modulus components + // used in the ADK multiplication in the inner loop. + // + + // Number of extra limbs required: + // 6 * (number of limbs / 4 - 1) * 2 = 12 * (number_of_limbs/4 - 1) + // + mov x24, x4 + mov x30, x4 + + // Save modulus pointer + mov x25, m + + mov count, x12 + +bignum_emontredc_8n_cdiff_precomp: + ldp a0, a1, [m, #32]! + ldp a2, a3, [m, #16] + +#define t x28 +#define c x29 + + cdiff(t, c, a1, a0) + stp t, c, [x30, #cache_m10] + cdiff(t, c, a2, a0) + stp t, c, [x30, #cache_m20] + cdiff(t, c, a3, a0) + stp t, c, [x30, #cache_m30] + cdiff(t, c, a2, a1) + stp t, c, [x30, #cache_m21] + cdiff(t, c, a3, a1) + stp t, c, [x30, #cache_m31] + cdiff(t, c, a3, a2) + stp t, c, [x30, #cache_m32] + + add x30, x30, #(6*16) + + subs count, count, #1 + cbnz count, bignum_emontredc_8n_cdiff_precomp + + // Set modulus pointer and buffer pointer back to its original value + mov m, x25 + mov x30, x24 + + // + // End of precomputation + // + + stp x3, x30, [sp] + stp x26, xzr, [sp, #16] + mov x28, xzr + lsl x0, x12, #5 + + movi v29.2d, #0x000000ffffffff + +bignum_emontredc_8n_cdiff_outerloop: + ldp x9, x13, [x1, #0] // .*.................................................................................................................................................................................................................. 
+ ldr x3, [sp] // *................................................................................................................................................................................................................... + lsr x27, x0, #5 // ......................................................................................................................................*............................................................................. + sub x27, x27, #1 // ...................................................................................................................................................................................................................* + ldp x10, x12, [x1, #16] // ..*................................................................................................................................................................................................................. + ldp x4, x15, [x2, #0] // ...*................................................................................................................................................................................................................ + ldr q1, [x2, #16] // .....*.............................................................................................................................................................................................................. + mul x11, x9, x3 // ......*............................................................................................................................................................................................................. + uzp2 v18.4S, v1.4S, v1.4S // ........*........................................................................................................................................................................................................... + dup v27.2D, x11 // .......*............................................................................................................................................................................................................ + xtn v13.2S, v1.2D // ..........*......................................................................................................................................................................................................... + rev64 v9.4S, v1.4S // ...........*........................................................................................................................................................................................................ + mul x7, x11, x4 // ...........................*........................................................................................................................................................................................ + rev64 v2.4S, v1.4S // ....................................................................................*............................................................................................................................... + uzp2 v20.4S, v1.4S, v1.4S // .................................................................................*.................................................................................................................................. 
+ mul v31.4S, v9.4S, v27.4S
+ xtn v14.2S, v1.2D
+ uzp2 v21.4S, v1.4S, v1.4S
+ umulh x22, x11, x15
+ xtn v17.2S, v27.2D
+ adds x19, x9, x7
+ umull v28.2D, v17.2S, v13.2S
+ umull v26.2D, v17.2S, v18.2S
+ uaddlp v8.2D, v31.4S
+ umulh x8, x11, x4
+ shl v7.2D, v8.2D, #32
+ uzp2 v30.4S, v27.4S, v27.4S
+ umlal v7.2D, v17.2S, v13.2S
+ mul x14, x11, x15
+ usra v26.2D, v28.2D, #32
+ umull v12.2D, v30.2S, v18.2S
+ mov x24, v7.d[0]
+ adcs x29, x13, x14
+ and v4.16B, v26.16B, v29.16B
+ mov x17, v7.d[1]
+ rev64 v27.4S, v1.4S
+ adcs x5, x10, x24
+ umlal v4.2D, v30.2S, v13.2S
+ usra v12.2D, v26.2D, #32
+ adcs x14, x12, x17
+ adc x23, xzr, xzr
+ adds x8, x29, x8
+ adcs x7, x5, x22
+ mul x25, x8, x3
+ usra v12.2D, v4.2D, #32
+ dup v8.2D, x25
+ stp x11, x25, [x1, #0]
+ mul x22, x25, x4
+ mov x16, v12.d[1]
+ ldr q16, [x1, #0]
+ mov x21, v12.d[0]
+ mul v31.4S, v27.4S, v8.4S
+ adcs x20, x14, x21
+ xtn v27.2S, v8.2D
+ adc x10, x23, x16
+ subs x14, x11, x25
+ rev64 v17.4S, v16.4S
+ cneg x17, x14, cc
+ csetm x26, cc
+ uaddlp v26.2D, v31.4S
+ mul x6, x25, x15
+ stp x17, x26, [sp, #cache_a01]
+ umull v24.2D, v27.2S, v14.2S
+ uzp2 v30.4S, v16.4S, v16.4S
+ shl v4.2D, v26.2D, #32
+ uzp2 v5.4S, v8.4S, v8.4S
+ umulh x17, x25, x4
+ umlal v4.2D, v27.2S, v14.2S
+ umull v8.2D, v27.2S, v21.2S
+ mov x21, v4.d[0]
+ adds x8, x8, x22
+ mov x12, v4.d[1]
+ ldp x23, x14, [x2, #16]
+ adcs x29, x7, x6
+ umulh x13, x25, x15
+ usra v8.2D, v24.2D, #32
+ ldp x8, x24, [x30, #cache_m20]
+ adcs x9, x20, x21
+ ldr q9, [x2, #32]!
+ xtn v28.2S, v16.2D
+ adcs x19, x10, x12
+ ldr q13, [x2, #16]
+ umull v18.2D, v5.2S, v21.2S
+ adc x7, xzr, xzr
+ adds x5, x29, x17
+ xtn v21.2S, v1.2D
+ mul x12, x5, x3
+ and v4.16B, v8.16B, v29.16B
+ adcs x21, x9, x13
+ uzp2 v31.4S, v9.4S, v9.4S
+ xtn v23.2S, v9.2D
+ usra v18.2D, v8.2D, #32
+ umlal v4.2D, v5.2S, v14.2S
+ dup v5.2D, x12
+ umull v16.2D, v23.2S, v30.2S
+ umull v1.2D, v23.2S, v28.2S
+ umulh x29, x12, x15
+ umull v8.2D, v31.2S, v30.2S
+ xtn v24.2S, v13.2D
+ mul v25.4S, v2.4S, v5.4S
+ usra v18.2D, v4.2D, #32
+ xtn v3.2S, v5.2D
+ uzp2 v19.4S, v5.4S, v5.4S
+ mul x10, x12, x15
+ umull v26.2D, v3.2S, v20.2S
+ mov x22, v18.d[0]
+ umull v10.2D, v3.2S, v21.2S
+ uaddlp v11.2D, v25.4S
+ mov x6, v18.d[1]
+ mul x16, x12, x4
+ umull v4.2D, v19.2S, v20.2S
+ usra v16.2D, v1.2D, #32
+ adcs x13, x19, x22
+ shl v11.2D, v11.2D, #32
+ adc x6, x7, x6
+ subs x7, x11, x12
+ usra v26.2D, v10.2D, #32
+ csetm x26, cc
+ cneg x20, x7, cc
+ subs x19, x25, x12
+ umlal v11.2D, v3.2S, v21.2S
+ cneg x9, x19, cc
+ stp x20, x26, [sp, #cache_a02]
+ umulh x7, x12, x4
+ usra v8.2D, v16.2D, #32
+ mul v7.4S, v17.4S, v9.4S
+ csetm x26, cc
+ adds x19, x5, x16
+ and v1.16B, v16.16B, v29.16B
+ adcs x21, x21, x10
+ stp x9, x26, [sp, #cache_a12]
+ ldp x17, x20, [sp, #cache_a02]
+ usra v4.2D, v26.2D, #32
+ and v18.16B, v26.16B, v29.16B
+ umlal v1.2D, v31.2S, v28.2S
+ mov x22, v11.d[0]
+ mov x16, v11.d[1]
+ umlal v18.2D, v19.2S, v21.2S
+ adcs x19, x13, x22
+ mul x22, x17, x8
+ uaddlp v5.2D, v7.4S
+ adcs x13, x6, x16
+ usra v8.2D, v1.2D, #32
+ adc x9, xzr, xzr
+ adds x5, x21, x7
+ usra v4.2D, v18.2D, #32
+ adcs x6, x19, x29
+ mul x19, x5, x3
+ shl v15.2D, v5.2D, #32
+ mov x3, v8.d[1]
+ umlal v15.2D, v23.2S, v28.2S
+ mov x21, v4.d[0]
+ mul x7, x19, x23
+ stp x12, x19, [x1, #16]
+ mov x10, v4.d[1]
+ ldr q9, [x1, #16]
+ adcs x13, x13, x21
+ mov x21, v15.d[1]
+ mul x16, x19, x4
+ adc x9, x9, x10
+ subs x29, x25, x19
+ csetm x26, cc
+ cneg x10, x29, cc
+ subs x29, x12, x19
+ stp x10, x26, [sp, #cache_a13]
+ uzp2 v18.4S, v9.4S, v9.4S
+ mul x12, x19, x15
+ rev64 v20.4S, v9.4S
+ xtn v19.2S, v9.2D
+ umull v25.2D, v24.2S, v18.2S
+ csetm x26, cc
+ umull v14.2D, v24.2S, v19.2S
+ cneg x29, x29, cc
+ umulh x10, x19, x23
+ adds x25, x5, x16
+ mul v7.4S, v20.4S, v13.4S
+ adcs x12, x6, x12
+ ldp x6, x5, [sp, #cache_a01]
+ mov x16, v8.d[0]
+ adcs x25, x13, x7
+ stp x29, x26, [sp, #cache_a23]
+ usra v25.2D, v14.2D, #32
+ mul x29, x19, x14
+ uzp2 v1.4S, v13.4S, v13.4S
+ uaddlp v7.2D, v7.4S
+ umull v0.2D, v1.2S, v18.2S
+ umulh x13, x19, x4
+ and v10.16B, v25.16B, v29.16B
+ shl v13.2D, v7.2D, #32
+ adcs x4, x9, x29
+ umlal v10.2D, v1.2S, v19.2S
+ adc x9, xzr, xzr
+ subs x29, x11, x19
+ usra v0.2D, v25.2D, #32
+ eor x11, x20, x24
+ umulh x15, x19, x15
+ umlal v13.2D, v24.2S, v19.2S
+ cneg x7, x29, cc
+ ldp x20, x29, [x1, #32]!
+ csetm x26, cc
+ usra v0.2D, v10.2D, #32
+ umulh x19, x19, x14
+ mov x23, v13.d[1]
+ stp x7, x26, [sp, #cache_a03]
+ adds x12, x12, x13
+ adcs x13, x25, x15
+ mov x26, v0.d[0]
+ umulh x8, x17, x8
+ adcs x14, x4, x10
+ mov x17, v13.d[0]
+ adc x15, x9, x19
+ ldp x24, x10, [x30], #96
+
+bignum_emontredc_8n_cdiff_maddloop_neon:
+
+ ldr q14, [x2, #32]!
+ ldr q25, [x2, #16]
+ eor x19, x5, x10
+ adds x25, x21, x16
+ mov x16, v0.d[1]
+ ldp x4, x7, [x1, #16]
+ adcs x21, x17, x3
+ eor x22, x22, x11
+ adcs x23, x23, x26
+ adc x17, x16, xzr
+ adds x16, x12, x20
+ mul x5, x6, x24
+ xtn v21.2S, v14.2D
+ xtn v31.2S, v25.2D
+ adcs x9, x13, x29
+ uzp2 v24.4S, v25.4S, v25.4S
+ mov x29, v15.d[0]
+ adcs x4, x14, x4
+ ldp x10, x13, [sp, #cache_a23]
+ umull v5.2D, v21.2S, v30.2S
+ umulh x20, x6, x24
+ adcs x24, x15, x7
+ ldp x12, x7, [x30, #cache_m32 - 96]
+ umull v16.2D, v31.2S, v18.2S
+ adc x6, xzr, xzr
+ adds x14, x25, x29
+ umull v13.2D, v21.2S, v28.2S
+ uzp2 v10.4S, v14.4S, v14.4S
+ eor x15, x8, x11
+ adcs x25, x21, x25
+ umull v1.2D, v31.2S, v19.2S
+ adcs x8, x23, x21
+ mul v6.4S, v20.4S, v25.4S
+ eor x7, x13, x7
+ adcs x23, x17, x23
+ eor x21, x5, x19
+ adc x13, xzr, x17
+ adds x17, x25, x29
+ umull v0.2D, v24.2S, v18.2S
+ usra v5.2D, v13.2D, #32
+ adcs x5, x8, x14
+ umull v2.2D, v10.2S, v30.2S
+ adcs x25, x23, x25
+ usra v16.2D, v1.2D, #32
+ adcs x8, x13, x8
+ uaddlp v13.2D, v6.4S
+ adcs x23, xzr, x23
+ and v7.16B, v5.16B, v29.16B
+ adc x13, xzr, x13
+ adds x29, x29, x16
+ mul x16, x10, x12
+ usra v2.2D, v5.2D, #32
+ adcs x9, x14, x9
+ and v25.16B, v16.16B, v29.16B
+ adcs x17, x17, x4
+ umlal v7.2D, v10.2S, v28.2S
+ umulh x12, x10, x12
+ adcs x10, x5, x24
+ usra v0.2D, v16.2D, #32
+ eor x5, x16, x7
+ ldp x16, x14, [x30, #cache_m31 - 96]
+ adcs x6, x25, x6
+ shl v16.2D, v13.2D, #32
+ eor x24, x20, x19
+ adcs x4, x8, xzr
+ ldp x20, x25, [sp, #cache_a13]
+ umlal v25.2D, v24.2S, v19.2S
+ adcs x23, x23, xzr
+ usra v2.2D, v7.2D, #32
+ umlal v16.2D, v31.2S, v19.2S
+ adc x8, x13, xzr
+ adds xzr, x7, #1
+ mul v7.4S, v17.4S, v14.4S
+ adcs x4, x4, x5
+ eor x5, x12, x7
+ adcs x23, x23, x5
+ mul x12, x20, x16
+ adc x5, x8, x7
+ adds xzr, x19, #1
+ adcs x21, x9, x21
+ eor x8, x25, x14
+ usra v0.2D, v25.2D, #32
+ adcs x13, x17, x24
+ stp x29, x21, [x1, #0]
+ umulh x20, x20, x16
+ uaddlp v10.2D, v7.4S
+ adcs x17, x10, x19
+ mov x3, v2.d[1]
+ ldp x29, x24, [sp, #cache_a03]
+ adcs x25, x6, x19
+ ldp x6, x21, [x30, #cache_m30 - 96]
+ eor x10, x12, x8
+ adcs x9, x4, x19
+ mov x26, v0.d[0]
+ ldp x4, x16, [x30, #cache_m21 - 96]
+ adcs x12, x23, x19
+ adc x5, x5, x19
+ adds xzr, x8, #1
+ ldp x7, x19, [sp, #cache_a12]
+ adcs x14, x25, x10
+ mul x25, x29, x6
+ eor x20, x20, x8
+ adcs x23, x9, x20
+ ldp x9, x20, [sp, #cache_a02]
+ eor x24, x24, x21
+ adcs x12, x12, x8
+ adc x10, x5, x8
+ adds xzr, x11, #1
+ umulh x5, x29, x6
+ shl v15.2D, v10.2D, #32
+ adcs x8, x13, x22
+ eor x13, x25, x24
+ adcs x29, x17, x15
+ umlal v15.2D, v21.2S, v28.2S
+ adcs x22, x14, x11
+ mov x17, v16.d[0]
+ adcs x21, x23, x11
+ mul x23, x7, x4
+ adcs x14, x12, x11
+ eor x12, x19, x16
+ mov x16, v2.d[0]
+ adc x15, x10, x11
+ adds xzr, x24, #1
+ eor x19, x5, x24
+ adcs x11, x29, x13
+ umulh x29, x7, x4
+ adcs x13, x22, x19
+ ldp x6, x5, [sp, #cache_a01]
+ ldp x7, x25, [x30, #cache_m20]
+ adcs x19, x21, x24
+ mov x21, v15.d[1]
+ eor x22, x23, x12
+ adcs x14, x14, x24
+ mov x23, v16.d[1]
+ adc x15, x15, x24
+ adds xzr, x12, #1
+ ldp x24, x10, [x30], #96
+ adcs x11, x11, x22
+ mul x22, x9, x7 // ..............................................................................................................e...................................... + eor x4, x29, x12 // ...............................................................................................................................................*..... + adcs x4, x13, x4 // ................................................................................................................................................*.... + stp x8, x11, [x1, #16] // ..............................................................................................................................................*...... + adcs x13, x19, x12 // .................................................................................................................................................*... + eor x11, x20, x25 // .............................................................................................................e....................................... + ldp x20, x29, [x1, #32]! // ..........................................e.......................................................................................................... + adcs x14, x14, x12 // ..................................................................................................................................................*.. + adc x15, x15, x12 // ...................................................................................................................................................*. + mov x12, x4 // ....................................................................................................................................................* + umulh x8, x9, x7 // ...............................................................................................................e..................................... + + sub count, count, #1 + cbnz count, bignum_emontredc_8n_cdiff_maddloop_neon +bignum_emontredc_8n_cdiff_inner_loop_postamble: + umulh x19, x6, x24 // ..............*........................................................................................................... + ldp x7, x9, [sp, #cache_a23] // .............*............................................................................................................ + adds x4, x21, x16 // .*........................................................................................................................ + mov x25, v0.d[1] // ..*....................................................................................................................... + eor x5, x5, x10 // *......................................................................................................................... + adcs x17, x17, x3 // ....*..................................................................................................................... + ldp x16, x10, [x1, #16] // ...*...................................................................................................................... + adcs x21, x23, x26 // ......*................................................................................................................... + eor x8, x8, x11 // ...................*...................................................................................................... + adc x23, x25, xzr // .......*.................................................................................................................. 
+ adds x20, x12, x20 // ........*................................................................................................................. + adcs x12, x13, x29 // ..........*............................................................................................................... + mov x25, v15.d[0] // ...........*.............................................................................................................. + adcs x13, x14, x16 // ............*............................................................................................................. + eor x16, x19, x5 // .........................................*................................................................................ + adcs x29, x15, x10 // ...............*.......................................................................................................... + ldp x14, x19, [x30, #cache_m32 - 96] // ................*......................................................................................................... + mul x15, x6, x24 // .........*................................................................................................................ + adc x24, xzr, xzr // .................*........................................................................................................ + adds x6, x4, x25 // ..................*....................................................................................................... + adcs x10, x17, x4 // ....................*..................................................................................................... + eor x4, x22, x11 // .....*.................................................................................................................... + adcs x17, x21, x17 // .....................*.................................................................................................... + eor x22, x9, x19 // ......................*................................................................................................... + adcs x9, x23, x21 // .......................*.................................................................................................. + adc x21, xzr, x23 // .........................*................................................................................................ + adds x23, x10, x25 // ..........................*............................................................................................... + eor x15, x15, x5 // ........................*................................................................................................. + adcs x19, x17, x6 // ...........................*.............................................................................................. + ldp x26, xzr, [sp, #16] // ...........................................................................................................*.............. + sub x2, x2, x0 // .....................................................................................................................*.... + adcs x10, x9, x10 // ............................*............................................................................................. + adcs x17, x21, x17 // .............................*............................................................................................ + adcs x9, xzr, x9 // ..............................*........................................................................................... 
+ adc x21, xzr, x21 // ...............................*.......................................................................................... + adds x25, x25, x20 // ................................*......................................................................................... + mul x20, x7, x14 // .................................*........................................................................................ + adcs x6, x6, x12 // ..................................*....................................................................................... + adcs x23, x23, x13 // ...................................*...................................................................................... + adcs x13, x19, x29 // .....................................*.................................................................................... + umulh x19, x7, x14 // ....................................*..................................................................................... + ldp x14, x29, [x30, #cache_m31 - 96] // .......................................*.................................................................................. + adcs x10, x10, x24 // ........................................*................................................................................. + ldp x12, x7, [sp, #cache_a13] // ...........................................*.............................................................................. + adcs x17, x17, xzr // ..........................................*............................................................................... + adcs x24, x9, xzr // ............................................*............................................................................. + adc x9, x21, xzr // .............................................*............................................................................ + adds xzr, x22, #1 // ..............................................*........................................................................... + eor x21, x20, x22 // ......................................*................................................................................... + adcs x20, x17, x21 // ...............................................*.......................................................................... + eor x17, x19, x22 // ................................................*......................................................................... + mul x19, x12, x14 // ..................................................*....................................................................... + eor x29, x7, x29 // ......................................................*................................................................... + adcs x17, x24, x17 // .................................................*........................................................................ + adc x9, x9, x22 // ...................................................*...................................................................... + adds xzr, x5, #1 // ....................................................*..................................................................... + umulh x21, x12, x14 // .........................................................*................................................................ + ldp x14, x24, [sp, #cache_a03] // ...........................................................*.............................................................. 
+ adcs x6, x6, x15 // .....................................................*.................................................................... + adcs x7, x23, x16 // .......................................................*.................................................................. + ldp x15, x12, [x30, #cache_m30 - 96] // .............................................................*............................................................ + eor x19, x19, x29 // ..............................................................*........................................................... + adcs x13, x13, x5 // ..........................................................*............................................................... + stp x25, x6, [x1, #0] // ........................................................*................................................................. + adcs x16, x10, x5 // ............................................................*............................................................. + ldp x10, x6, [x30, #cache_m21 - 96] // ................................................................*......................................................... + adcs x23, x20, x5 // ...............................................................*.......................................................... + adcs x25, x17, x5 // .................................................................*........................................................ + umulh x30, x14, x15 // .............................................................................*............................................ + ldp x17, x22, [sp, #cache_a12] // ....................................................................*..................................................... + adc x9, x9, x5 // ..................................................................*....................................................... + adds xzr, x29, #1 // ...................................................................*...................................................... + eor x21, x21, x29 // .......................................................................*.................................................. + adcs x16, x16, x19 // .....................................................................*.................................................... + adcs x5, x23, x21 // ........................................................................*................................................. + eor x23, x24, x12 // .........................................................................*................................................ + mul x12, x17, x10 // ...................................................................................*...................................... + adcs x19, x25, x29 // ..........................................................................*............................................... + adc x29, x9, x29 // ...........................................................................*.............................................. + adds xzr, x11, #1 // ............................................................................*............................................. + adcs x7, x7, x4 // ..............................................................................*........................................... + adcs x21, x13, x8 // ................................................................................*......................................... 
+ mul x4, x14, x15 // ......................................................................*................................................... + eor x9, x30, x23 // ........................................................................................*................................. + adcs x30, x16, x11 // .................................................................................*........................................ + adcs x24, x5, x11 // ..................................................................................*....................................... + adcs x16, x19, x11 // ....................................................................................*..................................... + adc x19, x29, x11 // ......................................................................................*................................... + adds xzr, x23, #1 // .......................................................................................*.................................. + eor x8, x4, x23 // ...............................................................................*.......................................... + adcs x4, x21, x8 // .........................................................................................*................................ + umulh x21, x17, x10 // ..........................................................................................*............................... + adcs x8, x30, x9 // ...........................................................................................*.............................. + ldp x10, x30, [x1, #32] // .........................................................................................................*................ + adcs x25, x24, x23 // ............................................................................................*............................. + eor x11, x22, x6 // .....................................................................................*.................................... + adcs x22, x16, x23 // ..............................................................................................*........................... + eor x5, x12, x11 // .............................................................................................*............................ + adc x29, x19, x23 // ...............................................................................................*.......................... + adds xzr, x11, #1 // ................................................................................................*......................... + eor x14, x21, x11 // ..................................................................................................*....................... + adcs x9, x4, x5 // .................................................................................................*........................ + stp x7, x9, [x1, #16] // ....................................................................................................*..................... + adcs x9, x8, x14 // ...................................................................................................*...................... + adcs x19, x25, x11 // .....................................................................................................*.................... + ldp x8, x21, [x1, #48] // ..........................................................................................................*............... 
+ mov x24, x9 // ........................................................................................................*................. + adcs x5, x22, x11 // ......................................................................................................*................... + adc x16, x29, x11 // .......................................................................................................*.................. + adds xzr, x28, x28 // ............................................................................................................*............. + adcs x17, x10, x24 // .............................................................................................................*............ + adcs x14, x30, x19 // ..............................................................................................................*........... + ldr x30, [sp, #8] // .........................................................................................................................* + adcs x8, x8, x5 // ...............................................................................................................*.......... + stp x17, x14, [x1, #32] // ..................................................................................................................*....... + adcs x9, x21, x16 // ................................................................................................................*......... + csetm x28, cs // .................................................................................................................*........ + stp x8, x9, [x1, #48] // ...................................................................................................................*...... + sub x26, x26, #1 // .......................................................................................................................*.. + sub x1, x1, x0 // ....................................................................................................................*..... + stp x26, xzr, [sp, #16] // ........................................................................................................................*. + add x1, x1, #32 // ......................................................................................................................*... 
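For orientation while reading the interleaved block above: the routine's name suggests blockwise (extended) Montgomery reduction, eight limbs per outer iteration, with the 64x64->128 partial products split between the scalar and NEON pipelines, and the trailing dot/asterisk comments appear to be instruction-scheduling annotations that record each instruction's slot in the unscheduled program order. As a reference for the arithmetic only, here is a textbook word-by-word sketch of that reduction. It assumes the usual Montgomery parameter w = -m[0]^(-1) mod 2^64 and is an illustration, not the s2n-bignum interface.

#include <stddef.h>
#include <stdint.h>

// Hedged sketch: reduce a 2k-word value z by a k-word odd modulus m,
// given w = -m[0]^(-1) mod 2^64.  Each outer step picks q so that
// z[i] + q*m[0] == 0 (mod 2^64); the scheduled loop above performs the
// same recurrence eight limbs at a time.
static uint64_t montgomery_reduce_words(uint64_t *z, size_t k,
                                        const uint64_t *m, uint64_t w) {
  uint64_t top = 0;  // carry out of the top word across outer iterations
  for (size_t i = 0; i < k; i++) {
    uint64_t q = z[i] * w;
    unsigned __int128 c = 0;
    for (size_t j = 0; j < k; j++) {
      unsigned __int128 t = (unsigned __int128)q * m[j] + z[i + j] + (uint64_t)c;
      z[i + j] = (uint64_t)t;  // z[i] becomes 0 when j == 0, by choice of q
      c = t >> 64;
    }
    unsigned __int128 t = (unsigned __int128)z[i + k] + (uint64_t)c + top;
    z[i + k] = (uint64_t)t;
    top = (uint64_t)(t >> 64);
  }
  return top;  // reduced value is z[k..2k-1] plus this top carry
}

The returned top carry corresponds, loosely, to the carry mask the epilogue below accumulates in x28 and converts with `neg x0, x28` into the function's return value.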
+ +bignum_emontredc_8n_cdiff_outer_loop_end: + + cbnz x26, bignum_emontredc_8n_cdiff_outerloop + neg x0, x28 + +bignum_emontredc_8n_cdiff_end: + add sp, sp, #32 + add sp, sp, #(6*16) + ldp d8, d9, [sp, #(0*16)] + ldp d10, d11, [sp, #(1*16)] + ldp d12, d13, [sp, #(2*16)] + ldp d14, d15, [sp, #(3*16)] + ldp x29, x30, [sp, #(4*16)] + ldp x27, x28, [sp, #(5*16)] + ldp x25, x26, [sp, #(6*16)] + ldp x23, x24, [sp, #(7*16)] + ldp x21, x22, [sp, #(8*16)] + ldp x19, x20, [sp, #(9*16)] + add sp, sp, #(10*16) + + ret diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_16_32.S similarity index 97% rename from third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_16_32.S index 70a8311fcb5..b53104bfe6e 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_16_32_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_16_32.S @@ -5,7 +5,7 @@ // Multiply z := x * y // Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] // -// extern void bignum_kmul_16_32_neon +// extern void bignum_kmul_16_32 // (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], // uint64_t t[static 32]) // @@ -16,8 +16,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_16_32_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_16_32_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_16_32) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_16_32) .text .balign 4 @@ -33,7 +33,7 @@ #define s x29 #define m x19 -S2N_BN_SYMBOL(bignum_kmul_16_32_neon): +S2N_BN_SYMBOL(bignum_kmul_16_32): // Save registers, including return address @@ -53,7 +53,7 @@ S2N_BN_SYMBOL(bignum_kmul_16_32_neon): // Compute L = x_lo * y_lo in bottom half of buffer (size 8 x 8 -> 16) - bl bignum_kmul_16_32_neon_local_mul_8_16 + bl bignum_kmul_16_32_local_mul_8_16 // Compute absolute difference [t..] = |x_lo - x_hi| // and the sign s = sgn(x_lo - x_hi) as a bitmask (all 1s for negative) @@ -102,7 +102,7 @@ S2N_BN_SYMBOL(bignum_kmul_16_32_neon): add x0, z, #128 add x1, x, #64 add x2, y, #64 - bl bignum_kmul_16_32_neon_local_mul_8_16 + bl bignum_kmul_16_32_local_mul_8_16 // Compute the other absolute difference [t+8..] = |y_hi - y_lo| // Collect the combined product sign bitmask (all 1s for negative) in s @@ -199,7 +199,7 @@ S2N_BN_SYMBOL(bignum_kmul_16_32_neon): add x0, t, #128 mov x1, t add x2, t, #64 - bl bignum_kmul_16_32_neon_local_mul_8_16 + bl bignum_kmul_16_32_local_mul_8_16 // Add the interlocking H' and L_bot terms, storing in registers x15..x0 // Intercept the carry at the 8 + 16 = 24 position and store it in x. 
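The hunks above only rename symbols, but the surrounding comments spell out the subtractive Karatsuba scheme the file implements: compute L = x_lo*y_lo and H = x_hi*y_hi, form the sign-magnitude differences |x_lo - x_hi| and |y_hi - y_lo| with a combined sign bitmask, and rebuild the middle term from L, H, and the signed product of the differences. A hedged, toy-sized illustration of that identity in C, with 64-bit operands split into 32-bit halves; `unsigned __int128` is a GCC/Clang extension used only for the demonstration, and none of these names come from the library:

#include <stdint.h>

typedef unsigned __int128 u128;

static u128 karatsuba_2x2(uint64_t x, uint64_t y) {
  uint64_t x0 = (uint32_t)x, x1 = x >> 32;    // x = x1*2^32 + x0
  uint64_t y0 = (uint32_t)y, y1 = y >> 32;    // y = y1*2^32 + y0

  uint64_t L = x0 * y0;                        // x_lo * y_lo
  uint64_t H = x1 * y1;                        // x_hi * y_hi
  uint64_t dx = x0 >= x1 ? x0 - x1 : x1 - x0;  // |x_lo - x_hi|
  uint64_t dy = y1 >= y0 ? y1 - y0 : y0 - y1;  // |y_hi - y_lo|
  int neg = (x0 >= x1) != (y1 >= y0);          // sign of (x_lo-x_hi)*(y_hi-y_lo)

  // Middle term: x_lo*y_hi + x_hi*y_lo = L + H + (x_lo-x_hi)*(y_hi-y_lo)
  u128 mid = (u128)L + H;
  u128 adj = (u128)dx * dy;
  mid = neg ? mid - adj : mid + adj;

  return ((u128)H << 64) + (mid << 32) + L;    // H*2^64 + mid*2^32 + L
}

Checking karatsuba_2x2(x, y) == (u128)x * y over random inputs exercises exactly the identity the comments describe; the assembly applies it to 512-bit halves and carries the sign as an all-ones/all-zeros mask instead of a bool.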
@@ -330,10 +330,10 @@ S2N_BN_SYMBOL(bignum_kmul_16_32_neon): ret // ---------------------------------------------------------------------------- -// Local copy of bignum_mul_8_16_neon without the scratch register save/restore +// Local copy of bignum_mul_8_16 without the scratch register save/restore // ---------------------------------------------------------------------------- -bignum_kmul_16_32_neon_local_mul_8_16: +bignum_kmul_16_32_local_mul_8_16: ldp x3, x4, [x1] ldr q0, [x1] ldp x7, x8, [x2] diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_32_64.S similarity index 98% rename from third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_32_64.S index fc716cbea84..313bbf020da 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_kmul_32_64_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_kmul_32_64.S @@ -5,7 +5,7 @@ // Multiply z := x * y // Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] // -// extern void bignum_kmul_32_64_neon +// extern void bignum_kmul_32_64 // (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], // uint64_t t[static 96]) // @@ -16,8 +16,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_32_64_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_32_64_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_32_64) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_32_64) .text .balign 4 @@ -31,7 +31,7 @@ #define c x16 -S2N_BN_SYMBOL(bignum_kmul_32_64_neon): +S2N_BN_SYMBOL(bignum_kmul_32_64): // Save extra registers and return address, store parameters safely @@ -49,7 +49,7 @@ S2N_BN_SYMBOL(bignum_kmul_32_64_neon): // Compute L = x_lo * y_lo in bottom half of buffer (size 16 x 16 -> 32) - bl bignum_kmul_32_64_neon_local_kmul_16_32 + bl bignum_kmul_32_64_local_kmul_16_32 // Compute H = x_hi * y_hi in top half of buffer (size 16 x 16 -> 32) @@ -57,7 +57,7 @@ S2N_BN_SYMBOL(bignum_kmul_32_64_neon): add x1, x, #8*K add x2, y, #8*K mov x3, t - bl bignum_kmul_32_64_neon_local_kmul_16_32 + bl bignum_kmul_32_64_local_kmul_16_32 // Compute absolute difference [t..] = |x_lo - x_hi| // and the sign x = sgn(x_lo - x_hi) as a bitmask (all 1s for negative) @@ -350,7 +350,7 @@ S2N_BN_SYMBOL(bignum_kmul_32_64_neon): mov x1, t add x2, t, #8*K add x3, t, #32*K - bl bignum_kmul_32_64_neon_local_kmul_16_32 + bl bignum_kmul_32_64_local_kmul_16_32 // Add the interlocking H' and L_bot terms // Intercept the carry at the 3k position and store it in x. @@ -644,11 +644,11 @@ S2N_BN_SYMBOL(bignum_kmul_32_64_neon): ldp x19, x20, [sp], #16 ret -// Local copy of bignum_kmul_16_32_neon, identical to main one except that it +// Local copy of bignum_kmul_16_32, identical to main one except that it // only preserves the key registers we need to be stable in the main code. -// This includes in turn a copy of bignum_mul_8_16_neon. +// This includes in turn a copy of bignum_mul_8_16. -bignum_kmul_32_64_neon_local_kmul_16_32: +bignum_kmul_32_64_local_kmul_16_32: stp x19, x20, [sp, -16]! stp x21, x22, [sp, -16]! stp x23, x30, [sp, -16]! 
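The bignum_kmul_32_64 comments above lean on the same sign-magnitude trick: each difference is stored as an absolute value plus a bitmask that is all ones when the true value is negative, and a product of two such values is re-signed later by XORing its limbs with the combined mask and pushing the +1 of the two's complement through the carry chain (the `adds xzr, sgn, #1` / `adcs` pattern). A single-word model of the idiom, as a hedged illustration rather than library code; the real routines apply it limb by limb with subs/sbcs/csetm:

#include <stdint.h>

// Branch-free absolute difference plus sign mask: returns |a - b| and
// writes an all-ones mask when a < b (i.e. when the subtraction borrowed).
static uint64_t absdiff_and_mask(uint64_t a, uint64_t b, uint64_t *mask) {
  uint64_t d = a - b;
  uint64_t m = (a < b) ? ~(uint64_t)0 : 0;  // csetm-style borrow mask
  *mask = m;
  return (d ^ m) - m;                        // conditional two's-complement negate
}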
@@ -656,7 +656,7 @@ bignum_kmul_32_64_neon_local_kmul_16_32: mov x26, x1 mov x27, x2 mov x28, x3 - bl bignum_kmul_32_64_neon_local_mul_8_16 + bl bignum_kmul_32_64_local_mul_8_16 ldp x10, x11, [x26] ldp x8, x9, [x26, #64] subs x10, x10, x8 @@ -698,7 +698,7 @@ bignum_kmul_32_64_neon_local_kmul_16_32: add x0, x25, #0x80 add x1, x26, #0x40 add x2, x27, #0x40 - bl bignum_kmul_32_64_neon_local_mul_8_16 + bl bignum_kmul_32_64_local_mul_8_16 ldp x10, x11, [x27] ldp x8, x9, [x27, #64] subs x10, x8, x10 @@ -777,7 +777,7 @@ bignum_kmul_32_64_neon_local_kmul_16_32: add x0, x28, #0x80 mov x1, x28 add x2, x28, #0x40 - bl bignum_kmul_32_64_neon_local_mul_8_16 + bl bignum_kmul_32_64_local_mul_8_16 ldp x0, x1, [x25] ldp x16, x17, [x25, #128] adds x0, x0, x16 @@ -883,7 +883,7 @@ bignum_kmul_32_64_neon_local_kmul_16_32: ldp x19, x20, [sp], #16 ret -bignum_kmul_32_64_neon_local_mul_8_16: +bignum_kmul_32_64_local_mul_8_16: ldp x3, x4, [x1] ldr q0, [x1] ldp x7, x8, [x2] diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_16_32.S similarity index 75% rename from third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_16_32.S index 7be2ac6c455..14873d9ef7c 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_16_32.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_16_32.S @@ -263,156 +263,245 @@ S2N_BN_SYMBOL(bignum_ksqr_16_32): // ----------------------------------------------------------------------------- bignum_ksqr_16_32_local_sqr_8_16: +// Load registers. ldp x2, x3, [x1] +ldr q20, [x1] ldp x4, x5, [x1, #16] +ldr q21, [x1, #16] ldp x6, x7, [x1, #32] +ldr q22, [x1, #32] ldp x8, x9, [x1, #48] +ldr q23, [x1, #48] +movi v30.2d, #0xffffffff + mul x17, x2, x4 mul x14, x3, x5 + +// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 +// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) +ext v1.16b, v20.16b, v20.16b, #8 umulh x20, x2, x4 +shrn v2.2s, v20.2d, #32 subs x21, x2, x3 - cneg x21, x21, cc - csetm x11, cc +zip1 v0.2s, v20.2s, v1.2s + cneg x21, x21, cc // cc = lo, ul, last +umull v5.2d, v2.2s, v2.2s + csetm x11, cc // cc = lo, ul, last +umull v6.2d, v2.2s, v0.2s subs x12, x5, x4 - cneg x12, x12, cc +umull v3.2d, v0.2s, v0.2s + cneg x12, x12, cc // cc = lo, ul, last +mov v1.16b, v6.16b mul x13, x21, x12 +usra v1.2d, v3.2d, #32 umulh x12, x21, x12 - cinv x11, x11, cc +and v4.16b, v1.16b, v30.16b + cinv x11, x11, cc // cc = lo, ul, last +add v4.2d, v4.2d, v6.2d eor x13, x13, x11 +usra v5.2d, v4.2d, #32 eor x12, x12, x11 +sli v3.2d, v4.2d, #32 adds x19, x17, x20 +usra v5.2d, v1.2d, #32 adc x20, x20, xzr + // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) + ext v1.16b, v21.16b, v21.16b, #8 umulh x21, x3, x5 + shrn v2.2s, v21.2d, #32 adds x19, x19, x14 + zip1 v0.2s, v21.2s, v1.2s adcs x20, x20, x21 adc x21, x21, xzr adds x20, x20, x14 adc x21, x21, xzr cmn x11, #0x1 adcs x19, x19, x13 +mov x13, v3.d[1] // mul x13, x3, x3 adcs x20, x20, x12 +mov x14, v5.d[1] // umulh x14, x3, x3 adc x21, x21, x11 +mov x12, v3.d[0] // mul x12, x2, x2 adds x17, x17, x17 +mov x11, v5.d[0] // umulh x11, x2, x2 adcs x19, x19, x19 + umull v5.2d, v2.2s, v2.2s adcs x20, x20, x20 + umull v6.2d, v2.2s, v0.2s adcs x21, x21, x21 + umull v3.2d, v0.2s, v0.2s adc x10, xzr, xzr - mul x12, x2, x2 - mul x13, x3, x3 + mov v1.16b, v6.16b + mul x15, x2, x3 - umulh x11, x2, x2 - umulh x14, x3, x3 + usra v1.2d, v3.2d, 
#32 umulh x16, x2, x3 + and v4.16b, v1.16b, v30.16b adds x11, x11, x15 + add v4.2d, v4.2d, v6.2d adcs x13, x13, x16 + usra v5.2d, v4.2d, #32 adc x14, x14, xzr + sli v3.2d, v4.2d, #32 adds x11, x11, x15 + usra v5.2d, v1.2d, #32 adcs x13, x13, x16 adc x14, x14, xzr stp x12, x11, [x0] + mov x11, v5.d[0] // umulh x11, x4, x4 adds x17, x17, x13 + mov x13, v3.d[1] // mul x13, x5, x5 adcs x19, x19, x14 + mov x14, v5.d[1] // umulh x14, x5, x5 adcs x20, x20, xzr + mov x12, v3.d[0] // mul x12, x4, x4 adcs x21, x21, xzr +// NEON: prepare muls in the upper half +ext v1.16b, v22.16b, v22.16b, #8 adc x10, x10, xzr +shrn v2.2s, v22.2d, #32 stp x17, x19, [x0, #16] - mul x12, x4, x4 - mul x13, x5, x5 +zip1 v0.2s, v22.2s, v1.2s mul x15, x4, x5 - umulh x11, x4, x4 - umulh x14, x5, x5 +umull v5.2d, v2.2s, v2.2s umulh x16, x4, x5 +umull v6.2d, v2.2s, v0.2s adds x11, x11, x15 +umull v3.2d, v0.2s, v0.2s adcs x13, x13, x16 +mov v1.16b, v6.16b adc x14, x14, xzr +usra v1.2d, v3.2d, #32 adds x11, x11, x15 +and v4.16b, v1.16b, v30.16b adcs x13, x13, x16 +add v4.2d, v4.2d, v6.2d adc x14, x14, xzr +usra v5.2d, v4.2d, #32 adds x12, x12, x20 +sli v3.2d, v4.2d, #32 adcs x11, x11, x21 +usra v5.2d, v1.2d, #32 stp x12, x11, [x0, #32] + // NEON: prepare muls in the upper half + ext v1.16b, v23.16b, v23.16b, #8 adcs x13, x13, x10 + shrn v2.2s, v23.2d, #32 adc x14, x14, xzr + zip1 v0.2s, v23.2s, v1.2s stp x13, x14, [x0, #48] + +// Scalar: square the upper half with a slight variant of the previous block mul x17, x6, x8 + umull v16.2d, v2.2s, v2.2s mul x14, x7, x9 + umull v6.2d, v2.2s, v0.2s umulh x20, x6, x8 + umull v18.2d, v0.2s, v0.2s subs x21, x6, x7 - cneg x21, x21, cc - csetm x11, cc + cneg x21, x21, cc // cc = lo, ul, last + mov v1.16b, v6.16b + csetm x11, cc // cc = lo, ul, last subs x12, x9, x8 - cneg x12, x12, cc + cneg x12, x12, cc // cc = lo, ul, last + usra v1.2d, v18.2d, #32 mul x13, x21, x12 + and v4.16b, v1.16b, v30.16b umulh x12, x21, x12 - cinv x11, x11, cc + add v4.2d, v4.2d, v6.2d + cinv x11, x11, cc // cc = lo, ul, last eor x13, x13, x11 eor x12, x12, x11 + usra v16.2d, v4.2d, #32 adds x19, x17, x20 adc x20, x20, xzr + sli v18.2d, v4.2d, #32 umulh x21, x7, x9 adds x19, x19, x14 adcs x20, x20, x21 adc x21, x21, xzr adds x20, x20, x14 +mov x14, v5.d[1] adc x21, x21, xzr cmn x11, #0x1 adcs x19, x19, x13 +mov x13, v3.d[1] adcs x20, x20, x12 +mov x12, v3.d[0] adc x21, x21, x11 +mov x11, v5.d[0] adds x17, x17, x17 adcs x19, x19, x19 + usra v16.2d, v1.2d, #32 adcs x20, x20, x20 adcs x21, x21, x21 adc x10, xzr, xzr - mul x12, x6, x6 - mul x13, x7, x7 +// NEON: two mul+umulhs for the next stage +uzp2 v17.4s, v21.4s, v23.4s mul x15, x6, x7 - umulh x11, x6, x6 - umulh x14, x7, x7 +xtn v4.2s, v23.2d umulh x16, x6, x7 + mov x22, v16.d[0] adds x11, x11, x15 adcs x13, x13, x16 +xtn v5.2s, v21.2d adc x14, x14, xzr adds x11, x11, x15 +rev64 v1.4s, v21.4s adcs x13, x13, x16 adc x14, x14, xzr stp x12, x11, [x0, #64] adds x17, x17, x13 + mov x13, v18.d[1] adcs x19, x19, x14 + mov x14, v16.d[1] adcs x20, x20, xzr + mov x12, v18.d[0] adcs x21, x21, xzr adc x10, x10, xzr +umull v6.2d, v4.2s, v5.2s stp x17, x19, [x0, #80] - mul x12, x8, x8 - mul x13, x9, x9 +umull v7.2d, v4.2s, v17.2s mul x15, x8, x9 - umulh x11, x8, x8 - umulh x14, x9, x9 +uzp2 v16.4s, v23.4s, v23.4s umulh x16, x8, x9 - adds x11, x11, x15 +mul v0.4s, v1.4s, v23.4s + adds x11, x22, x15 adcs x13, x13, x16 +usra v7.2d, v6.2d, #32 adc x14, x14, xzr adds x11, x11, x15 +umull v1.2d, v16.2s, v17.2s adcs x13, x13, x16 adc x14, x14, xzr +uaddlp v0.2d, v0.4s adds x12, x12, 
x20 adcs x11, x11, x21 +and v2.16b, v7.16b, v30.16b +umlal v2.2d, v16.2s, v5.2s +shl v0.2d, v0.2d, #32 +usra v1.2d, v7.2d, #32 +umlal v0.2d, v4.2s, v5.2s +mov x16, v0.d[1] +mov x15, v0.d[0] +usra v1.2d, v2.2d, #32 +mov x20, v1.d[0] +mov x21, v1.d[1] stp x12, x11, [x0, #96] adcs x13, x13, x10 adc x14, x14, xzr stp x13, x14, [x0, #112] + +// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] + mul x10, x2, x6 mul x14, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 umulh x17, x2, x6 adds x14, x14, x17 umulh x17, x3, x7 adcs x15, x15, x17 - umulh x17, x4, x8 - adcs x16, x16, x17 - umulh x17, x5, x9 - adc x17, x17, xzr + adcs x16, x16, x20 + adc x17, x21, xzr adds x11, x14, x10 adcs x14, x15, x14 adcs x15, x16, x15 @@ -425,13 +514,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, xzr, x16 adc x17, xzr, x17 subs x22, x4, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x8 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x15, x15, x21 @@ -439,13 +528,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x20 adc x17, x17, x19 subs x22, x2, x3 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x7, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x11, x11, x21 @@ -457,13 +546,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x3, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x7 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x14, x14, x21 @@ -472,13 +561,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x2, x4 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x8, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x12, x12, x21 @@ -489,13 +578,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x2, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x13, x13, x21 @@ -505,13 +594,13 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x3, x4 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x8, x7 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x13, x13, x21 @@ -529,30 +618,39 @@ bignum_ksqr_16_32_local_sqr_8_16: adcs x16, x16, x16 adcs x17, x17, x17 adc x19, 
xzr, xzr + +// Add it back to the buffer + ldp x2, x3, [x0, #32] adds x10, x10, x2 adcs x11, x11, x3 stp x10, x11, [x0, #32] + ldp x2, x3, [x0, #48] adcs x12, x12, x2 adcs x13, x13, x3 stp x12, x13, [x0, #48] + ldp x2, x3, [x0, #64] adcs x14, x14, x2 adcs x15, x15, x3 stp x14, x15, [x0, #64] + ldp x2, x3, [x0, #80] adcs x16, x16, x2 adcs x17, x17, x3 stp x16, x17, [x0, #80] + ldp x2, x3, [x0, #96] adcs x2, x2, x19 adcs x3, x3, xzr stp x2, x3, [x0, #96] + ldp x2, x3, [x0, #112] adcs x2, x2, xzr adc x3, x3, xzr stp x2, x3, [x0, #112] + ret #if defined(__linux__) && defined(__ELF__) diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_32_64.S similarity index 84% rename from third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_32_64.S index 659e00a7919..c54e673c672 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_ksqr_32_64.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_ksqr_32_64.S @@ -680,156 +680,245 @@ bignum_ksqr_32_64_local_ksqr_16_32: ret bignum_ksqr_32_64_local_sqr_8_16: +// Load registers. ldp x2, x3, [x1] +ldr q20, [x1] ldp x4, x5, [x1, #16] +ldr q21, [x1, #16] ldp x6, x7, [x1, #32] +ldr q22, [x1, #32] ldp x8, x9, [x1, #48] +ldr q23, [x1, #48] +movi v30.2d, #0xffffffff + mul x17, x2, x4 mul x14, x3, x5 + +// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 +// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) +ext v1.16b, v20.16b, v20.16b, #8 umulh x20, x2, x4 +shrn v2.2s, v20.2d, #32 subs x21, x2, x3 - cneg x21, x21, cc - csetm x11, cc +zip1 v0.2s, v20.2s, v1.2s + cneg x21, x21, cc // cc = lo, ul, last +umull v5.2d, v2.2s, v2.2s + csetm x11, cc // cc = lo, ul, last +umull v6.2d, v2.2s, v0.2s subs x12, x5, x4 - cneg x12, x12, cc +umull v3.2d, v0.2s, v0.2s + cneg x12, x12, cc // cc = lo, ul, last +mov v1.16b, v6.16b mul x13, x21, x12 +usra v1.2d, v3.2d, #32 umulh x12, x21, x12 - cinv x11, x11, cc +and v4.16b, v1.16b, v30.16b + cinv x11, x11, cc // cc = lo, ul, last +add v4.2d, v4.2d, v6.2d eor x13, x13, x11 +usra v5.2d, v4.2d, #32 eor x12, x12, x11 +sli v3.2d, v4.2d, #32 adds x19, x17, x20 +usra v5.2d, v1.2d, #32 adc x20, x20, xzr + // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) + ext v1.16b, v21.16b, v21.16b, #8 umulh x21, x3, x5 + shrn v2.2s, v21.2d, #32 adds x19, x19, x14 + zip1 v0.2s, v21.2s, v1.2s adcs x20, x20, x21 adc x21, x21, xzr adds x20, x20, x14 adc x21, x21, xzr cmn x11, #0x1 adcs x19, x19, x13 +mov x13, v3.d[1] // mul x13, x3, x3 adcs x20, x20, x12 +mov x14, v5.d[1] // umulh x14, x3, x3 adc x21, x21, x11 +mov x12, v3.d[0] // mul x12, x2, x2 adds x17, x17, x17 +mov x11, v5.d[0] // umulh x11, x2, x2 adcs x19, x19, x19 + umull v5.2d, v2.2s, v2.2s adcs x20, x20, x20 + umull v6.2d, v2.2s, v0.2s adcs x21, x21, x21 + umull v3.2d, v0.2s, v0.2s adc x10, xzr, xzr - mul x12, x2, x2 - mul x13, x3, x3 + mov v1.16b, v6.16b + mul x15, x2, x3 - umulh x11, x2, x2 - umulh x14, x3, x3 + usra v1.2d, v3.2d, #32 umulh x16, x2, x3 + and v4.16b, v1.16b, v30.16b adds x11, x11, x15 + add v4.2d, v4.2d, v6.2d adcs x13, x13, x16 + usra v5.2d, v4.2d, #32 adc x14, x14, xzr + sli v3.2d, v4.2d, #32 adds x11, x11, x15 + usra v5.2d, v1.2d, #32 adcs x13, x13, x16 adc x14, x14, xzr stp x12, x11, [x0] + mov x11, v5.d[0] // umulh x11, x4, x4 adds x17, x17, x13 + mov x13, v3.d[1] // mul x13, x5, x5 adcs x19, x19, x14 + mov x14, v5.d[1] // umulh x14, x5, x5 adcs x20, 
x20, xzr + mov x12, v3.d[0] // mul x12, x4, x4 adcs x21, x21, xzr +// NEON: prepare muls in the upper half +ext v1.16b, v22.16b, v22.16b, #8 adc x10, x10, xzr +shrn v2.2s, v22.2d, #32 stp x17, x19, [x0, #16] - mul x12, x4, x4 - mul x13, x5, x5 +zip1 v0.2s, v22.2s, v1.2s mul x15, x4, x5 - umulh x11, x4, x4 - umulh x14, x5, x5 +umull v5.2d, v2.2s, v2.2s umulh x16, x4, x5 +umull v6.2d, v2.2s, v0.2s adds x11, x11, x15 +umull v3.2d, v0.2s, v0.2s adcs x13, x13, x16 +mov v1.16b, v6.16b adc x14, x14, xzr +usra v1.2d, v3.2d, #32 adds x11, x11, x15 +and v4.16b, v1.16b, v30.16b adcs x13, x13, x16 +add v4.2d, v4.2d, v6.2d adc x14, x14, xzr +usra v5.2d, v4.2d, #32 adds x12, x12, x20 +sli v3.2d, v4.2d, #32 adcs x11, x11, x21 +usra v5.2d, v1.2d, #32 stp x12, x11, [x0, #32] + // NEON: prepare muls in the upper half + ext v1.16b, v23.16b, v23.16b, #8 adcs x13, x13, x10 + shrn v2.2s, v23.2d, #32 adc x14, x14, xzr + zip1 v0.2s, v23.2s, v1.2s stp x13, x14, [x0, #48] + +// Scalar: square the upper half with a slight variant of the previous block mul x17, x6, x8 + umull v16.2d, v2.2s, v2.2s mul x14, x7, x9 + umull v6.2d, v2.2s, v0.2s umulh x20, x6, x8 + umull v18.2d, v0.2s, v0.2s subs x21, x6, x7 - cneg x21, x21, cc - csetm x11, cc + cneg x21, x21, cc // cc = lo, ul, last + mov v1.16b, v6.16b + csetm x11, cc // cc = lo, ul, last subs x12, x9, x8 - cneg x12, x12, cc + cneg x12, x12, cc // cc = lo, ul, last + usra v1.2d, v18.2d, #32 mul x13, x21, x12 + and v4.16b, v1.16b, v30.16b umulh x12, x21, x12 - cinv x11, x11, cc + add v4.2d, v4.2d, v6.2d + cinv x11, x11, cc // cc = lo, ul, last eor x13, x13, x11 eor x12, x12, x11 + usra v16.2d, v4.2d, #32 adds x19, x17, x20 adc x20, x20, xzr + sli v18.2d, v4.2d, #32 umulh x21, x7, x9 adds x19, x19, x14 adcs x20, x20, x21 adc x21, x21, xzr adds x20, x20, x14 +mov x14, v5.d[1] adc x21, x21, xzr cmn x11, #0x1 adcs x19, x19, x13 +mov x13, v3.d[1] adcs x20, x20, x12 +mov x12, v3.d[0] adc x21, x21, x11 +mov x11, v5.d[0] adds x17, x17, x17 adcs x19, x19, x19 + usra v16.2d, v1.2d, #32 adcs x20, x20, x20 adcs x21, x21, x21 adc x10, xzr, xzr - mul x12, x6, x6 - mul x13, x7, x7 +// NEON: two mul+umulhs for the next stage +uzp2 v17.4s, v21.4s, v23.4s mul x15, x6, x7 - umulh x11, x6, x6 - umulh x14, x7, x7 +xtn v4.2s, v23.2d umulh x16, x6, x7 + mov x22, v16.d[0] adds x11, x11, x15 adcs x13, x13, x16 +xtn v5.2s, v21.2d adc x14, x14, xzr adds x11, x11, x15 +rev64 v1.4s, v21.4s adcs x13, x13, x16 adc x14, x14, xzr stp x12, x11, [x0, #64] adds x17, x17, x13 + mov x13, v18.d[1] adcs x19, x19, x14 + mov x14, v16.d[1] adcs x20, x20, xzr + mov x12, v18.d[0] adcs x21, x21, xzr adc x10, x10, xzr +umull v6.2d, v4.2s, v5.2s stp x17, x19, [x0, #80] - mul x12, x8, x8 - mul x13, x9, x9 +umull v7.2d, v4.2s, v17.2s mul x15, x8, x9 - umulh x11, x8, x8 - umulh x14, x9, x9 +uzp2 v16.4s, v23.4s, v23.4s umulh x16, x8, x9 - adds x11, x11, x15 +mul v0.4s, v1.4s, v23.4s + adds x11, x22, x15 adcs x13, x13, x16 +usra v7.2d, v6.2d, #32 adc x14, x14, xzr adds x11, x11, x15 +umull v1.2d, v16.2s, v17.2s adcs x13, x13, x16 adc x14, x14, xzr +uaddlp v0.2d, v0.4s adds x12, x12, x20 adcs x11, x11, x21 +and v2.16b, v7.16b, v30.16b +umlal v2.2d, v16.2s, v5.2s +shl v0.2d, v0.2d, #32 +usra v1.2d, v7.2d, #32 +umlal v0.2d, v4.2s, v5.2s +mov x16, v0.d[1] +mov x15, v0.d[0] +usra v1.2d, v2.2d, #32 +mov x20, v1.d[0] +mov x21, v1.d[1] stp x12, x11, [x0, #96] adcs x13, x13, x10 adc x14, x14, xzr stp x13, x14, [x0, #112] + +// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] + mul x10, x2, x6 mul x14, x3, x7 - 
mul x15, x4, x8 - mul x16, x5, x9 umulh x17, x2, x6 adds x14, x14, x17 umulh x17, x3, x7 adcs x15, x15, x17 - umulh x17, x4, x8 - adcs x16, x16, x17 - umulh x17, x5, x9 - adc x17, x17, xzr + adcs x16, x16, x20 + adc x17, x21, xzr adds x11, x14, x10 adcs x14, x15, x14 adcs x15, x16, x15 @@ -842,13 +931,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, xzr, x16 adc x17, xzr, x17 subs x22, x4, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x8 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x15, x15, x21 @@ -856,13 +945,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x20 adc x17, x17, x19 subs x22, x2, x3 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x7, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x11, x11, x21 @@ -874,13 +963,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x3, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x7 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x14, x14, x21 @@ -889,13 +978,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x2, x4 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x8, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x12, x12, x21 @@ -906,13 +995,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x2, x5 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x9, x6 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x13, x13, x21 @@ -922,13 +1011,13 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x19 adc x17, x17, x19 subs x22, x3, x4 - cneg x22, x22, cc - csetm x19, cc + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last subs x20, x8, x7 - cneg x20, x20, cc + cneg x20, x20, cc // cc = lo, ul, last mul x21, x22, x20 umulh x20, x22, x20 - cinv x19, x19, cc + cinv x19, x19, cc // cc = lo, ul, last cmn x19, #0x1 eor x21, x21, x19 adcs x13, x13, x21 @@ -946,30 +1035,39 @@ bignum_ksqr_32_64_local_sqr_8_16: adcs x16, x16, x16 adcs x17, x17, x17 adc x19, xzr, xzr + +// Add it back to the buffer + ldp x2, x3, [x0, #32] adds x10, x10, x2 adcs x11, x11, x3 stp x10, x11, [x0, #32] + ldp x2, x3, [x0, #48] adcs x12, x12, x2 adcs x13, x13, x3 stp x12, x13, [x0, #48] + ldp x2, x3, [x0, #64] adcs x14, x14, x2 adcs x15, x15, x3 stp x14, x15, [x0, #64] + ldp x2, x3, [x0, #80] adcs x16, x16, x2 adcs x17, x17, x3 stp x16, x17, [x0, #80] + ldp x2, x3, [x0, #96] adcs x2, x2, x19 adcs x3, x3, xzr stp x2, x3, 
[x0, #96] + ldp x2, x3, [x0, #112] adcs x2, x2, xzr adc x3, x3, xzr stp x2, x3, [x0, #112] + ret #if defined(__linux__) && defined(__ELF__) diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8.S new file mode 100644 index 00000000000..11f57583bf5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8.S @@ -0,0 +1,252 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +// +// extern void bignum_mul_4_8 +// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a0short w3 +#define a1 x4 +#define b0 x5 +#define b0short w5 +#define b1 x6 + +#define u0 x7 +#define u1 x8 +#define u2 x9 +#define u3 x10 +#define u4 x11 +#define u5 x12 +#define u6 x13 +#define u7 x14 + +#define t x15 + +#define sgn x16 +#define ysgn x17 + +// These are aliases to registers used elsewhere including input pointers. +// By the time they are used this does not conflict with other uses. + +#define m0 y +#define m1 ysgn +#define m2 t +#define m3 x +#define u u2 + +S2N_BN_SYMBOL(bignum_mul_4_8): + +// Multiply the low halves using Karatsuba 2x2->4 to get [u3,u2,u1,u0] +// The zeroth multiplication (only) is done via 32-bit breakdowns + + ldp a0, a1, [x] + ldp b0, b1, [y] + + umull u0, a0short, b0short + lsr x17, a0, #32 + umull x15, w17, b0short + lsr x16, b0, #32 + umull u1, w16, w17 + umull x16, a0short, w16 + adds u0, u0, x15, lsl #32 + lsr x15, x15, #32 + adc u1, u1, x15 + adds u0, u0, x16, lsl #32 + lsr x16, x16, #32 + adc u1, u1, x16 + + mul u2, a1, b1 + umulh u3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u2, u2, u1 + adc u3, u3, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u1, u0, u2 + adcs u2, u2, u3 + adc u3, u3, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u1, t, u1 + eor a0, a0, sgn + adcs u2, a0, u2 + adc u3, u3, sgn + +// Multiply the high halves using Karatsuba 2x2->4 to get [u7,u6,u5,u4] +// Again, the zeroth multiplication (only) is done via 32-bit breakdowns + + ldp a0, a1, [x, #16] + ldp b0, b1, [y, #16] + + umull u4, a0short, b0short + lsr x17, a0, #32 + umull x15, w17, b0short + lsr x16, b0, #32 + umull u5, w16, w17 + umull x16, a0short, w16 + adds u4, u4, x15, lsl #32 + lsr x15, x15, #32 + adc u5, u5, x15 + adds u4, u4, x16, lsl #32 + lsr x16, x16, #32 + adc u5, u5, x16 + + mul u6, a1, b1 + umulh u7, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u6, u6, u5 + adc u7, u7, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u5, u4, u6 + adcs u6, u6, u7 + adc u7, u7, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u5, t, u5 + eor a0, a0, sgn + adcs u6, a0, u6 + adc u7, u7, sgn + +// Compute sgn,[a1,a0] = x_hi - x_lo +// and ysgn,[b1,b0] = y_lo - y_hi +// sign-magnitude differences + + ldp a0, a1, [x, #16] + ldp t, sgn, 
[x] + subs a0, a0, t + sbcs a1, a1, sgn + csetm sgn, cc + + ldp t, ysgn, [y] + subs b0, t, b0 + sbcs b1, ysgn, b1 + csetm ysgn, cc + + eor a0, a0, sgn + subs a0, a0, sgn + eor a1, a1, sgn + sbc a1, a1, sgn + + eor b0, b0, ysgn + subs b0, b0, ysgn + eor b1, b1, ysgn + sbc b1, b1, ysgn + +// Save the correct sign for the sub-product + + eor sgn, ysgn, sgn + +// Add H' = H + L_top, still in [u7,u6,u5,u4] + + adds u4, u4, u2 + adcs u5, u5, u3 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Now compute the mid-product as [m3,m2,m1,m0] + + mul m0, a0, b0 + umulh m1, a0, b0 + mul m2, a1, b1 + umulh m3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm u, cc + + adds m2, m2, m1 + adc m3, m3, xzr + + subs b1, b0, b1 + cneg b1, b1, cc + cinv u, u, cc + + mul b0, a1, b1 + umulh b1, a1, b1 + + adds m1, m0, m2 + adcs m2, m2, m3 + adc m3, m3, xzr + + adds xzr, u, #1 + eor b0, b0, u + adcs m1, b0, m1 + eor b1, b1, u + adcs m2, b1, m2 + adc m3, m3, u + +// Accumulate the positive mid-terms as [u7,u6,u5,u4,u3,u2] + + adds u2, u4, u0 + adcs u3, u5, u1 + adcs u4, u6, u4 + adcs u5, u7, u5 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Add in the sign-adjusted complex term + + adds xzr, sgn, #1 + eor m0, m0, sgn + adcs u2, m0, u2 + eor m1, m1, sgn + adcs u3, m1, u3 + eor m2, m2, sgn + adcs u4, m2, u4 + eor m3, m3, sgn + adcs u5, m3, u5 + adcs u6, u6, sgn + adc u7, u7, sgn + +// Store back the result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + stp u4, u5, [z, #32] + stp u6, u7, [z, #48] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8_alt.S new file mode 100644 index 00000000000..b082b8011dd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_4_8_alt.S @@ -0,0 +1,147 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +// +// extern void bignum_mul_4_8_alt +// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define t x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 + +S2N_BN_SYMBOL(bignum_mul_4_8_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul t, a0, b1 + umulh u2, a0, b1 + adds u1, u1, t + + ldp b2, b3, [y, #16] + + mul t, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, t + + mul t, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, t + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul t, a1, b0 + adds u1, u1, t + mul t, a1, b1 + adcs u2, u2, t + mul t, a1, b2 + adcs u3, u3, t + mul t, a1, b3 + adcs u4, u4, t + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh t, a1, b0 + adds u2, u2, t + umulh t, a1, b1 + adcs u3, u3, t + umulh t, a1, b2 + adcs u4, u4, t + adc u5, u5, xzr + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul t, a2, b0 + adds u2, u2, t + mul t, a2, b1 + adcs u3, u3, t + mul t, a2, b2 + adcs u4, u4, t + mul t, a2, b3 + adcs u5, u5, t + umulh u6, a2, b3 + adc u6, u6, xzr + + umulh t, a2, b0 + adds u3, u3, t + umulh t, a2, b1 + adcs u4, u4, t + umulh t, a2, b2 + adcs u5, u5, t + adc u6, u6, xzr + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] + + mul t, a3, b0 + adds u3, u3, t + mul t, a3, b1 + adcs u4, u4, t + mul t, a3, b2 + adcs u5, u5, t + mul t, a3, b3 + adcs u6, u6, t + umulh u7, a3, b3 + adc u7, u7, xzr + + umulh t, a3, b0 + adds u4, u4, t + umulh t, a3, b1 + adcs u5, u5, t + umulh t, a3, b2 + adcs u6, u6, t + adc u7, u7, xzr + +// Store back final result [a3;...a0] * [b3;...;b0] = a * b + + stp u0, u1, [z] + stp u2, u3, [z, #16] + stp u4, u5, [z, #32] + stp u6, u7, [z, #48] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12.S new file mode 100644 index 00000000000..b32b19102e6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12.S @@ -0,0 +1,278 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12 +// (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro computing [c,b,a] := [b,a] + (x - y) * (w - z), adding with carry +// to the [b,a] components but leaving CF aligned with the c term, which is +// a sign bitmask for (x - y) * (w - z). Continued add-with-carry operations +// with [c,...,c] will continue the carry chain correctly starting from +// the c position if desired to add to a longer term of the form [...,b,a]. +// +// c,h,l,t should all be different and t,h should not overlap w,z. +// --------------------------------------------------------------------------- + +.macro muldiffnadd b,a, c,h,l,t, x,y, w,z + subs \t, \x, \y + cneg \t, \t, cc + csetm \c, cc + subs \h, \w, \z + cneg \h, \h, cc + mul \l, \t, \h + umulh \h, \t, \h + cinv \c, \c, cc + adds xzr, \c, #1 + eor \l, \l, \c + adcs \a, \a, \l + eor \h, \h, \c + adcs \b, \b, \h +.endm + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define b0 x6 +#define b1 x7 +#define b2 x8 +#define l0 x9 +#define l1 x10 +#define l2 x11 +#define h0 x12 +#define h1 x13 +#define h2 x14 + +#define s1 x15 +#define s2 x16 +#define s3 x17 +#define s4 x19 +#define s5 x9 + +#define c x10 +#define h x11 +#define l x12 +#define t x13 + +#define s0 x20 + +#define u0 x3 +#define u1 x4 +#define u2 x5 +#define u3 x6 +#define u4 x7 +#define u5 x8 + +// These alias c,h,l but it doesn't matter + +#define u6 x10 +#define u7 x11 +#define u8 x12 + +// We recycle the input pointers near the end + +#define s x1 +#define d x2 + +// --------------------------------------------------------------------------- +// Core 3x3->6 ADK multiplication macro +// Does [s5,s4,s3,s2,s1,s0] = [a2,a1,a0] * [b2,b1,b0] +// +// If the input parameter is 1, it also adds in [z+24,z+32,z+40] +// existing contents; if the parameter is 0 it just does the pure multiply +// --------------------------------------------------------------------------- + +.macro mul3 afl + mul s0, a0, b0 + mul l1, a1, b1 + mul l2, a2, b2 + umulh h0, a0, b0 + umulh h1, a1, b1 + umulh h2, a2, b2 + + adds h0, h0, l1 + adcs h1, h1, l2 + adc h2, h2, xzr + + adds s1, h0, s0 + adcs s2, h1, h0 + adcs s3, h2, h1 + adc s4, h2, xzr + + adds s2, s2, s0 + adcs s3, s3, h0 + adcs s4, s4, h1 + adc s5, h2, xzr + +// Optionally add the existing z contents + +.rep \afl + ldr l, [z,#24] + adds s0, s0, l + ldp l, h, [z,#32] + adcs s1, s1, l + adcs s2, s2, h + adcs s3, s3, xzr + adcs s4, s4, xzr + adc s5, s5, xzr +.endr + + muldiffnadd s2,s1, c,h,l, t, a0,a1, b1,b0 + adcs s3, s3, c + adcs s4, s4, c + adc s5, s5, c + + muldiffnadd s3,s2, c,h,l, t, a0,a2, b2,b0 + adcs s4, s4, c + adc s5, s5, c + + muldiffnadd s4,s3, c,h,l, t, a1,a2, b2,b1 + adc s5, s5, c +.endm + +S2N_BN_SYMBOL(bignum_mul_6_12): + +stp x19, x20, [sp, #-16]! 
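With the callee-saved registers stashed, the function body is essentially three passes of the mul3/muldiffnadd machinery defined above. As a reading aid only, here is a plain C model of what a single muldiffnadd step contributes: multiply the two magnitudes, then fold the combined sign back in with the XOR-plus-carry (1s-complement) trick the macro comment describes, leaving an all-ones/all-zeros mask to sign-extend into higher words. The helper name muldiffnadd_model and the __int128 accumulation are our own illustration, not s2n-bignum code.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Model of one muldiffnadd step: [b,a] += (x - y) * (w - z) as a signed
 * value, returning the product's sign as an all-ones/all-zeros mask c.
 * Higher words of a longer accumulator are extended by adding c together
 * with the returned carry, mirroring the "adcs ..., ..., c" chains below. */
static uint64_t muldiffnadd_model(uint64_t *b, uint64_t *a,
                                  uint64_t x, uint64_t y,
                                  uint64_t w, uint64_t z,
                                  unsigned *carry_out) {
    uint64_t ax = x >= y ? x - y : y - x;                 /* |x - y| */
    uint64_t aw = w >= z ? w - z : z - w;                 /* |w - z| */
    uint64_t c = ((x < y) != (w < z)) ? ~(uint64_t)0 : 0; /* sign mask */
    u128 p = (u128)ax * aw;                               /* magnitude product */
    uint64_t l = (uint64_t)p ^ c;                         /* conditional 1s complement */
    uint64_t h = (uint64_t)(p >> 64) ^ c;
    u128 s = (u128)*a + l + (c & 1);                      /* the +1 completes the negation */
    *a = (uint64_t)s;
    s = (u128)*b + h + (uint64_t)(s >> 64);
    *b = (uint64_t)s;
    *carry_out = (unsigned)(s >> 64);
    return c;
}

In mul3, three such steps correct the initial six-word combination, which is the classic ADK (Karatsuba-style) 3x3 multiplier structure.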
+ +// Multiply the low halves using ADK 3x3->6 + + ldp a0, a1, [x1] + ldp b0, b1, [x2] + ldr a2, [x1, #16] + ldr b2, [x2, #16] + + mul3 0 + stp s0, s1, [x0] + stp s2, s3, [x0, #16] + stp s4, s5, [x0, #32] + +// Multiply the high halves using ADK 3x3->6 + + ldp a0, a1, [x1,#24] + ldp b0, b1, [x2,#24] + ldr a2, [x1, #40] + ldr b2, [x2, #40] + + mul3 1 + + stp s0, s1, [x0, #48] + stp s2, s3, [x0, #64] + stp s4, s5, [x0, #80] + +// Compute t,[a2,a1,a0] = x_hi - x_lo +// and s,[b2,b1,b0] = y_lo - y_hi +// sign-magnitude differences + + ldr t, [x1] + subs a0, a0, t + ldr t, [x1,#8] + sbcs a1, a1, t + ldr t, [x1,#16] + sbcs a2, a2, t + csetm t, cc + + ldr s, [x2] + subs b0, s, b0 + ldr s, [x2,#8] + sbcs b1, s, b1 + ldr s, [x2,#16] + sbcs b2, s, b2 + csetm s, cc + + eor a0, a0, t + subs a0, a0, t + eor a1, a1, t + sbcs a1, a1, t + eor a2, a2, t + sbc a2, a2, t + + eor b0, b0, s + subs b0, b0, s + eor b1, b1, s + sbcs b1, b1, s + eor b2, b2, s + sbc b2, b2, s + +// Save the correct sign for the sub-product + + eor s, s, t + +// Now yet another 3x3->6 ADK core, but not writing back, keeping s0..s5 + + mul3 0 + +// Now accumulate the positive mid-terms as [u5,u4,u3,u2,u1,u0] + + ldp u0, u1, [z] + ldp u3, u4, [z,#48] + adds u0, u0, u3 + adcs u1, u1, u4 + ldr u2, [z,#16] + ldp u5, u6, [z,#64] + adcs u2, u2, u5 + adcs u3, u3, u6 + ldp u7, u8, [z,#80] + adcs u4, u4, u7 + adcs u5, u5, u8 + +// Stop the carry here so we can reintroduce it, taking into account the +// effective addition of s from sign-extension below. Note that we get +// a duplicated word c+carry beyond the first one, so this upper part is +// of the form [d,d,t]. + + adcs t, s, xzr + adc d, s, xzr + +// Add in the sign-adjusted complex term + + adds xzr, s, #1 + eor s0, s0, s + adcs u0, s0, u0 + eor s1, s1, s + adcs u1, s1, u1 + eor s2, s2, s + adcs u2, s2, u2 + eor s3, s3, s + adcs u3, s3, u3 + eor s4, s4, s + adcs u4, s4, u4 + eor s5, s5, s + adcs u5, s5, u5 + adcs u6, u6, t + adcs u7, u7, d + adc u8, u8, d + +// Store it back + + str u0, [x0,#24] + stp u1, u2, [x0,#32] + stp u3, u4, [x0,#48] + stp u5, u6, [x0,#64] + stp u7, u8, [x0,#80] + +// Restore regs and return + + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12_alt.S new file mode 100644 index 00000000000..72dfd7fcd69 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_6_12_alt.S @@ -0,0 +1,264 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12_alt +// (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +// These are repeated mod 2 as we load pairs of inputs + +#define a0 x3 +#define a1 x4 +#define a2 x3 +#define a3 x4 +#define a4 x3 +#define a5 x4 + +#define b0 x5 +#define b1 x6 +#define b2 x7 +#define b3 x8 +#define b4 x9 +#define b5 x10 + +#define t x11 + +// These repeat mod 8 as we write back + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 +#define u5 x17 +#define u6 x19 +#define u7 x20 +#define u8 x12 +#define u9 x13 +#define u10 x14 +#define u11 x15 + +S2N_BN_SYMBOL(bignum_mul_6_12_alt): + +// Save more registers + + stp x19, x20, [sp, #-16]! + +// Load operands and set up row 0 = [u6;...;u0] = a0 * [b5;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul t, a0, b1 + umulh u2, a0, b1 + adds u1, u1, t + + ldp b2, b3, [y, #16] + + mul t, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, t + + mul t, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, t + + ldp b4, b5, [y, #32] + + mul t, a0, b4 + umulh u5, a0, b4 + adcs u4, u4, t + + mul t, a0, b5 + umulh u6, a0, b5 + adcs u5, u5, t + + adc u6, u6, xzr + +// Row 1 = [u7;...;u0] = [a1;a0] * [b5;...;b0] + + mul t, a1, b0 + adds u1, u1, t + mul t, a1, b1 + adcs u2, u2, t + mul t, a1, b2 + adcs u3, u3, t + mul t, a1, b3 + adcs u4, u4, t + mul t, a1, b4 + adcs u5, u5, t + mul t, a1, b5 + adcs u6, u6, t + cset u7, cs + + umulh t, a1, b0 + adds u2, u2, t + umulh t, a1, b1 + adcs u3, u3, t + umulh t, a1, b2 + adcs u4, u4, t + umulh t, a1, b3 + adcs u5, u5, t + umulh t, a1, b4 + adcs u6, u6, t + umulh t, a1, b5 + adc u7, u7, t + + stp u0, u1, [z] + +// Row 2 = [u8;...;u0] = [a2;a1;a0] * [b5;...;b0] + + ldp a2, a3, [x, #16] + + mul t, a2, b0 + adds u2, u2, t + mul t, a2, b1 + adcs u3, u3, t + mul t, a2, b2 + adcs u4, u4, t + mul t, a2, b3 + adcs u5, u5, t + mul t, a2, b4 + adcs u6, u6, t + mul t, a2, b5 + adcs u7, u7, t + cset u8, cs + + umulh t, a2, b0 + adds u3, u3, t + umulh t, a2, b1 + adcs u4, u4, t + umulh t, a2, b2 + adcs u5, u5, t + umulh t, a2, b3 + adcs u6, u6, t + umulh t, a2, b4 + adcs u7, u7, t + umulh t, a2, b5 + adc u8, u8, t + +// Row 3 = [u9;...;u0] = [a3;a2;a1;a0] * [b5;...;b0] + + mul t, a3, b0 + adds u3, u3, t + mul t, a3, b1 + adcs u4, u4, t + mul t, a3, b2 + adcs u5, u5, t + mul t, a3, b3 + adcs u6, u6, t + mul t, a3, b4 + adcs u7, u7, t + mul t, a3, b5 + adcs u8, u8, t + cset u9, cs + + umulh t, a3, b0 + adds u4, u4, t + umulh t, a3, b1 + adcs u5, u5, t + umulh t, a3, b2 + adcs u6, u6, t + umulh t, a3, b3 + adcs u7, u7, t + umulh t, a3, b4 + adcs u8, u8, t + umulh t, a3, b5 + adc u9, u9, t + + stp u2, u3, [z, #16] + +// Row 4 = [u10;...;u0] = [a4;a3;a2;a1;a0] * [b5;...;b0] + + ldp a4, a5, [x, #32] + + mul t, a4, b0 + adds u4, u4, t + mul t, a4, b1 + adcs u5, u5, t + mul t, a4, b2 + adcs u6, u6, t + mul t, a4, b3 + adcs u7, u7, t + mul t, a4, b4 + adcs u8, u8, t + mul t, a4, b5 + adcs u9, u9, t + cset u10, cs + + umulh t, a4, b0 + 
adds u5, u5, t + umulh t, a4, b1 + adcs u6, u6, t + umulh t, a4, b2 + adcs u7, u7, t + umulh t, a4, b3 + adcs u8, u8, t + umulh t, a4, b4 + adcs u9, u9, t + umulh t, a4, b5 + adc u10, u10, t + +// Row 5 = [u11;...;u0] = [a5;a4;a3;a2;a1;a0] * [b5;...;b0] + + mul t, a5, b0 + adds u5, u5, t + mul t, a5, b1 + adcs u6, u6, t + mul t, a5, b2 + adcs u7, u7, t + mul t, a5, b3 + adcs u8, u8, t + mul t, a5, b4 + adcs u9, u9, t + mul t, a5, b5 + adcs u10, u10, t + cset u11, cs + + umulh t, a5, b0 + adds u6, u6, t + umulh t, a5, b1 + adcs u7, u7, t + umulh t, a5, b2 + adcs u8, u8, t + umulh t, a5, b3 + adcs u9, u9, t + umulh t, a5, b4 + adcs u10, u10, t + umulh t, a5, b5 + adc u11, u11, t + + stp u4, u5, [z, #32] + +// Store back remaining digits of final result + + stp u6, u7, [z, #48] + stp u8, u9, [z, #64] + stp u10, u11, [z, #80] + +// Restore registers and return + + ldp x19, x20, [sp], #16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16.S new file mode 100644 index 00000000000..5aa9b386945 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16.S @@ -0,0 +1,521 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16 +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_mul_8_16): + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
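The body that follows interleaves scalar Karatsuba corrections with NEON sequences (uzp1/rev64/mul.4s/uaddlp/shl/umlal) that evaluate 64-bit multiplications out of 32x32->64 pieces, two lanes at a time; the scalar umulh instructions supply the high halves where the packed path only yields the low 64 bits. As a reading aid, here is a scalar C model of the underlying decomposition those idioms rely on. The helper name mul_64x64_from_32 is ours, not s2n-bignum's.

#include <stdint.h>

typedef unsigned __int128 u128;

/* Rebuild a 64x64->128 product from four 32x32->64 partial products:
 * a*b = (ah*bh)<<64 + (ah*bl + al*bh)<<32 + al*bl. */
static u128 mul_64x64_from_32(uint64_t a, uint64_t b) {
    uint64_t al = (uint32_t)a, ah = a >> 32;
    uint64_t bl = (uint32_t)b, bh = b >> 32;
    uint64_t ll = al * bl;     /* bits   0..63  */
    uint64_t lh = al * bh;     /* bits  32..95  */
    uint64_t hl = ah * bl;     /* bits  32..95  */
    uint64_t hh = ah * bh;     /* bits  64..127 */
    u128 mid = (u128)lh + hl;  /* at most 65 bits, cannot overflow */
    return ((u128)hh << 64) + (mid << 32) + ll;  /* equals (u128)a * b */
}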
+ ldp x3, x4, [x1] + ldr q0, [x1] + ldp x7, x8, [x2] + ldr q1, [x2] + ldp x5, x6, [x1, #16] + ldr q2, [x1, #16] + ldp x9, x10, [x2, #16] + ldr q3, [x2, #16] + uzp1 v4.4s, v1.4s, v0.4s + rev64 v1.4s, v1.4s + uzp1 v5.4s, v0.4s, v0.4s + mul v0.4s, v1.4s, v0.4s + uaddlp v0.2d, v0.4s + shl v0.2d, v0.2d, #32 + umlal v0.2d, v5.2s, v4.2s + mov x11, v0.d[0] + mov x15, v0.d[1] + uzp1 v0.4s, v3.4s, v2.4s + rev64 v1.4s, v3.4s + uzp1 v3.4s, v2.4s, v2.4s + mul v1.4s, v1.4s, v2.4s + uaddlp v1.2d, v1.4s + shl v1.2d, v1.2d, #32 + umlal v1.2d, v3.2s, v0.2s + mov x16, v1.d[0] + mov x17, v1.d[1] + ldr q0, [x1, #32] + ldr q1, [x2, #32] + ldr q2, [x1, #48] + ldr q3, [x2, #48] + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + uzp1 v4.4s, v1.4s, v0.4s + rev64 v1.4s, v1.4s + uzp1 v5.4s, v0.4s, v0.4s + mul v0.4s, v1.4s, v0.4s + uaddlp v0.2d, v0.4s + shl v0.2d, v0.2d, #32 + umlal v0.2d, v5.2s, v4.2s + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x3, x4, [x1, #32] + stp x11, x12, [x0] + ldp x7, x8, [x2, #32] + stp x13, x14, [x0, #16] + ldp x5, x6, [x1, #48] + stp x15, x16, [x0, #32] + ldp x9, x10, [x2, #48] + stp x17, x19, [x0, #48] + mov x11, v0.d[0] + mov x15, v0.d[1] + uzp1 v0.4s, v3.4s, v2.4s + rev64 v1.4s, v3.4s + uzp1 v3.4s, v2.4s, v2.4s + mul v1.4s, v1.4s, v2.4s + uaddlp v1.2d, v1.4s + shl v1.2d, v1.2d, #32 + umlal v1.2d, v3.2s, v0.2s + mov x16, 
v1.d[0] + mov x17, v1.d[1] + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + ldp x22, x21, [x0, #32] + adds x11, x11, x22 + adcs x12, x12, x21 + ldp x22, x21, [x0, #48] + adcs x13, x13, x22 + adcs x14, x14, x21 + adcs x15, x15, xzr + adcs x16, x16, xzr + adcs x17, x17, xzr + adc x19, x19, xzr + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x22, x21, [x1] + subs x3, x3, x22 + sbcs x4, x4, x21 + ldp x22, x21, [x1, #16] + sbcs x5, x5, x22 + sbcs x6, x6, x21 + csetm x24, cc + stp x11, x12, [x0, #64] + ldp x22, x21, [x2] + subs x7, x22, x7 + sbcs x8, x21, x8 + ldp x22, x21, [x2, #16] + sbcs x9, x22, x9 + sbcs x10, x21, x10 + csetm x1, cc + stp x13, x14, [x0, #80] + eor x3, x3, x24 + subs x3, x3, x24 + eor x4, x4, x24 + sbcs x4, x4, x24 + eor x5, x5, x24 + sbcs x5, x5, x24 + eor x6, x6, x24 + sbc x6, x6, x24 + stp x15, x16, [x0, #96] + eor x7, x7, x1 + subs x7, x7, x1 + eor x8, x8, x1 + sbcs x8, x8, x1 + eor x9, x9, x1 + sbcs x9, x9, x1 + eor x10, x10, x1 + sbc x10, x10, x1 + stp x17, x19, [x0, #112] + eor x1, x1, x24 + mul x11, x3, x7 + mul x15, x4, x8 + mul x16, x5, x9 + mul x17, x6, x10 + umulh x19, x3, x7 + adds x15, x15, x19 + umulh x19, x4, x8 + adcs x16, x16, x19 + umulh x19, x5, x9 + adcs x17, x17, x19 + umulh x19, x6, x10 + adc x19, x19, xzr + adds x12, x15, x11 + adcs x15, x16, x15 + adcs x16, x17, x16 + adcs x17, x19, x17 + 
adc x19, xzr, x19 + adds x13, x15, x11 + adcs x14, x16, x12 + adcs x15, x17, x15 + adcs x16, x19, x16 + adcs x17, xzr, x17 + adc x19, xzr, x19 + subs x24, x5, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x9 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x16, x16, x22 + eor x21, x21, x20 + adcs x17, x17, x21 + adc x19, x19, x20 + subs x24, x3, x4 + cneg x24, x24, cc + csetm x20, cc + subs x21, x8, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x12, x12, x22 + eor x21, x21, x20 + adcs x13, x13, x21 + adcs x14, x14, x20 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x15, x15, x22 + eor x21, x21, x20 + adcs x16, x16, x21 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x13, x13, x22 + eor x21, x21, x20 + adcs x14, x14, x21 + adcs x15, x15, x20 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x3, x6 + cneg x24, x24, cc + csetm x20, cc + subs x21, x10, x7 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + subs x24, x4, x5 + cneg x24, x24, cc + csetm x20, cc + subs x21, x9, x8 + cneg x21, x21, cc + mul x22, x24, x21 + umulh x21, x24, x21 + cinv x20, x20, cc + cmn x20, #0x1 + eor x22, x22, x20 + adcs x14, x14, x22 + eor x21, x21, x20 + adcs x15, x15, x21 + adcs x16, x16, x20 + adcs x17, x17, x20 + adc x19, x19, x20 + ldp x3, x4, [x0] + ldp x7, x8, [x0, #64] + adds x3, x3, x7 + adcs x4, x4, x8 + ldp x5, x6, [x0, #16] + ldp x9, x10, [x0, #80] + adcs x5, x5, x9 + adcs x6, x6, x10 + ldp x20, x21, [x0, #96] + adcs x7, x7, x20 + adcs x8, x8, x21 + ldp x22, x23, [x0, #112] + adcs x9, x9, x22 + adcs x10, x10, x23 + adcs x24, x1, xzr + adc x2, x1, xzr + cmn x1, #0x1 + eor x11, x11, x1 + adcs x3, x11, x3 + eor x12, x12, x1 + adcs x4, x12, x4 + eor x13, x13, x1 + adcs x5, x13, x5 + eor x14, x14, x1 + adcs x6, x14, x6 + eor x15, x15, x1 + adcs x7, x15, x7 + eor x16, x16, x1 + adcs x8, x16, x8 + eor x17, x17, x1 + adcs x9, x17, x9 + eor x19, x19, x1 + adcs x10, x19, x10 + adcs x20, x20, x24 + adcs x21, x21, x2 + adcs x22, x22, x2 + adc x23, x23, x2 + stp x3, x4, [x0, #32] + stp x5, x6, [x0, #48] + stp x7, x8, [x0, #64] + stp x9, x10, [x0, #80] + stp x20, x21, [x0, #96] + stp x22, x23, [x0, #112] + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16_alt.S new file mode 100644 index 00000000000..2d0a80e1c80 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_mul_8_16_alt.S @@ -0,0 +1,406 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16_alt +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +// These are repeated mod 2 as we load paris of inputs + +#define a0 x3 +#define a1 x4 +#define a2 x3 +#define a3 x4 +#define a4 x3 +#define a5 x4 +#define a6 x3 +#define a7 x4 + +#define b0 x5 +#define b1 x6 +#define b2 x7 +#define b3 x8 +#define b4 x9 +#define b5 x10 +#define b6 x11 +#define b7 x12 + +#define t x13 + +// These repeat mod 10 as we write back + +#define u0 x14 +#define u1 x15 +#define u2 x16 +#define u3 x17 +#define u4 x19 +#define u5 x20 +#define u6 x21 +#define u7 x22 +#define u8 x23 +#define u9 x24 +#define u10 x14 +#define u11 x15 +#define u12 x16 +#define u13 x17 +#define u14 x19 +#define u15 x20 + +S2N_BN_SYMBOL(bignum_mul_8_16_alt): + +// Save more registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + +// Load operands and set up row 0 = [u8;...;u0] = a0 * [b7;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul t, a0, b1 + umulh u2, a0, b1 + adds u1, u1, t + + ldp b2, b3, [y, #16] + + mul t, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, t + + mul t, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, t + + ldp b4, b5, [y, #32] + + mul t, a0, b4 + umulh u5, a0, b4 + adcs u4, u4, t + + mul t, a0, b5 + umulh u6, a0, b5 + adcs u5, u5, t + + ldp b6, b7, [y, #48] + + mul t, a0, b6 + umulh u7, a0, b6 + adcs u6, u6, t + + mul t, a0, b7 + umulh u8, a0, b7 + adcs u7, u7, t + + adc u8, u8, xzr + +// Row 1 = [u9;...;u0] = [a1;a0] * [b7;...;b0] + + mul t, a1, b0 + adds u1, u1, t + mul t, a1, b1 + adcs u2, u2, t + mul t, a1, b2 + adcs u3, u3, t + mul t, a1, b3 + adcs u4, u4, t + mul t, a1, b4 + adcs u5, u5, t + mul t, a1, b5 + adcs u6, u6, t + mul t, a1, b6 + adcs u7, u7, t + mul t, a1, b7 + adcs u8, u8, t + cset u9, cs + + umulh t, a1, b0 + adds u2, u2, t + umulh t, a1, b1 + adcs u3, u3, t + umulh t, a1, b2 + adcs u4, u4, t + umulh t, a1, b3 + adcs u5, u5, t + umulh t, a1, b4 + adcs u6, u6, t + umulh t, a1, b5 + adcs u7, u7, t + umulh t, a1, b6 + adcs u8, u8, t + umulh t, a1, b7 + adc u9, u9, t + + stp u0, u1, [z] + +// Row 2 = [u10;...;u0] = [a2;a1;a0] * [b7;...;b0] + + ldp a2, a3, [x, #16] + + mul t, a2, b0 + adds u2, u2, t + mul t, a2, b1 + adcs u3, u3, t + mul t, a2, b2 + adcs u4, u4, t + mul t, a2, b3 + adcs u5, u5, t + mul t, a2, b4 + adcs u6, u6, t + mul t, a2, b5 + adcs u7, u7, t + mul t, a2, b6 + adcs u8, u8, t + mul t, a2, b7 + adcs u9, u9, t + cset u10, cs + + umulh t, a2, b0 + adds u3, u3, t + umulh t, a2, b1 + adcs u4, u4, t + umulh t, a2, b2 + adcs u5, u5, t + umulh t, a2, b3 + adcs u6, u6, t + umulh t, a2, b4 + adcs u7, u7, t + umulh t, a2, b5 + adcs u8, u8, t + umulh t, a2, b6 + adcs u9, u9, t + umulh t, a2, b7 + adc u10, u10, t + +// Row 3 = [u11;...;u0] = [a3;a2;a1;a0] * [b7;...;b0] + + mul t, a3, b0 + adds u3, u3, t + mul t, a3, b1 + adcs u4, u4, t + mul t, a3, b2 + adcs u5, u5, t + mul t, a3, b3 + adcs u6, u6, t + mul t, a3, b4 + adcs u7, u7, t + mul t, 
a3, b5 + adcs u8, u8, t + mul t, a3, b6 + adcs u9, u9, t + mul t, a3, b7 + adcs u10, u10, t + cset u11, cs + + umulh t, a3, b0 + adds u4, u4, t + umulh t, a3, b1 + adcs u5, u5, t + umulh t, a3, b2 + adcs u6, u6, t + umulh t, a3, b3 + adcs u7, u7, t + umulh t, a3, b4 + adcs u8, u8, t + umulh t, a3, b5 + adcs u9, u9, t + umulh t, a3, b6 + adcs u10, u10, t + umulh t, a3, b7 + adc u11, u11, t + + stp u2, u3, [z, #16] + +// Row 4 = [u12;...;u0] = [a4;a3;a2;a1;a0] * [b7;...;b0] + + ldp a4, a5, [x, #32] + + mul t, a4, b0 + adds u4, u4, t + mul t, a4, b1 + adcs u5, u5, t + mul t, a4, b2 + adcs u6, u6, t + mul t, a4, b3 + adcs u7, u7, t + mul t, a4, b4 + adcs u8, u8, t + mul t, a4, b5 + adcs u9, u9, t + mul t, a4, b6 + adcs u10, u10, t + mul t, a4, b7 + adcs u11, u11, t + cset u12, cs + + umulh t, a4, b0 + adds u5, u5, t + umulh t, a4, b1 + adcs u6, u6, t + umulh t, a4, b2 + adcs u7, u7, t + umulh t, a4, b3 + adcs u8, u8, t + umulh t, a4, b4 + adcs u9, u9, t + umulh t, a4, b5 + adcs u10, u10, t + umulh t, a4, b6 + adcs u11, u11, t + umulh t, a4, b7 + adc u12, u12, t + +// Row 5 = [u13;...;u0] = [a5;a4;a3;a2;a1;a0] * [b7;...;b0] + + mul t, a5, b0 + adds u5, u5, t + mul t, a5, b1 + adcs u6, u6, t + mul t, a5, b2 + adcs u7, u7, t + mul t, a5, b3 + adcs u8, u8, t + mul t, a5, b4 + adcs u9, u9, t + mul t, a5, b5 + adcs u10, u10, t + mul t, a5, b6 + adcs u11, u11, t + mul t, a5, b7 + adcs u12, u12, t + cset u13, cs + + umulh t, a5, b0 + adds u6, u6, t + umulh t, a5, b1 + adcs u7, u7, t + umulh t, a5, b2 + adcs u8, u8, t + umulh t, a5, b3 + adcs u9, u9, t + umulh t, a5, b4 + adcs u10, u10, t + umulh t, a5, b5 + adcs u11, u11, t + umulh t, a5, b6 + adcs u12, u12, t + umulh t, a5, b7 + adc u13, u13, t + + stp u4, u5, [z, #32] + +// Row 6 = [u14;...;u0] = [a6;a5;a4;a3;a2;a1;a0] * [b7;...;b0] + + ldp a6, a7, [x, #48] + + mul t, a6, b0 + adds u6, u6, t + mul t, a6, b1 + adcs u7, u7, t + mul t, a6, b2 + adcs u8, u8, t + mul t, a6, b3 + adcs u9, u9, t + mul t, a6, b4 + adcs u10, u10, t + mul t, a6, b5 + adcs u11, u11, t + mul t, a6, b6 + adcs u12, u12, t + mul t, a6, b7 + adcs u13, u13, t + cset u14, cs + + umulh t, a6, b0 + adds u7, u7, t + umulh t, a6, b1 + adcs u8, u8, t + umulh t, a6, b2 + adcs u9, u9, t + umulh t, a6, b3 + adcs u10, u10, t + umulh t, a6, b4 + adcs u11, u11, t + umulh t, a6, b5 + adcs u12, u12, t + umulh t, a6, b6 + adcs u13, u13, t + umulh t, a6, b7 + adc u14, u14, t + +// Row 7 = [u15;...;u0] = [a7;a6;a5;a4;a3;a2;a1;a0] * [b7;...;b0] + + mul t, a7, b0 + adds u7, u7, t + mul t, a7, b1 + adcs u8, u8, t + mul t, a7, b2 + adcs u9, u9, t + mul t, a7, b3 + adcs u10, u10, t + mul t, a7, b4 + adcs u11, u11, t + mul t, a7, b5 + adcs u12, u12, t + mul t, a7, b6 + adcs u13, u13, t + mul t, a7, b7 + adcs u14, u14, t + cset u15, cs + + umulh t, a7, b0 + adds u8, u8, t + umulh t, a7, b1 + adcs u9, u9, t + umulh t, a7, b2 + adcs u10, u10, t + umulh t, a7, b3 + adcs u11, u11, t + umulh t, a7, b4 + adcs u12, u12, t + umulh t, a7, b5 + adcs u13, u13, t + umulh t, a7, b6 + adcs u14, u14, t + umulh t, a7, b7 + adc u15, u15, t + + stp u6, u7, [z, #48] + +// Store back remaining digits of final result + + stp u8, u9, [z, #64] + stp u10, u11, [z, #80] + stp u12, u13, [z, #96] + stp u14, u15, [z, #112] + +// Restore registers + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8.S 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8.S new file mode 100644 index 00000000000..e6fb56c6e31 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8.S @@ -0,0 +1,144 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[4]; output z[8] +// +// extern void bignum_sqr_4_8 (uint64_t z[static 8], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// 2x2 squaring macro: [s3;s2;s1;s0] := [a1;a0]^2 with t0,t1,t2 temporaries +// This uses 32x32->64 multiplications to reduce the number of UMULHs +// --------------------------------------------------------------------------- + +#define sqr2(s3,s2,s1,s0, a1,a1short,a0,a0short, t2,t1,t0,t0short) \ + umull s0, a0short, a0short __LF \ + lsr t0, a0, #32 __LF \ + umull s1, t0short, t0short __LF \ + umull t0, a0short, t0short __LF \ + adds s0, s0, t0, lsl #33 __LF \ + lsr t0, t0, #31 __LF \ + adc s1, s1, t0 __LF \ + umull s2, a1short, a1short __LF \ + lsr t0, a1, #32 __LF \ + umull s3, t0short, t0short __LF \ + umull t0, a1short, t0short __LF \ + mul t1, a0, a1 __LF \ + umulh t2, a0, a1 __LF \ + adds s2, s2, t0, lsl #33 __LF \ + lsr t0, t0, #31 __LF \ + adc s3, s3, t0 __LF \ + adds t1, t1, t1 __LF \ + adcs t2, t2, t2 __LF \ + adc s3, s3, xzr __LF \ + adds s1, s1, t1 __LF \ + adcs s2, s2, t2 __LF \ + adc s3, s3, xzr + +// Main code + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define s0 x6 +#define s1 x7 +#define s2 x8 +#define s3 x9 +#define s4 x10 +#define s5 x11 +#define s6 x12 +#define s7 x13 + +#define d0 x14 +#define d1 x15 +#define d2 x16 + +// Short versions + +#define a0short w2 +#define a1short w3 +#define a2short w4 +#define a3short w5 +#define d2short w16 +#define s3short w9 + +S2N_BN_SYMBOL(bignum_sqr_4_8): + +// Load all the elements + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + +// Compute L = [s3;s2;s1;s0] = square of lower half + + sqr2(s3,s2,s1,s0, a1,a1short,a0,a0short, d0,d1,d2,d2short) + +// Compute H = [s7;s6;s5;s4] = square of upper half + + sqr2(s7,s6,s5,s4, a3,a3short,a2,a2short, d0,d1,d2,d2short) + +// Let [a1;a0] = |[a3;a2] - [a1;a0]| be the absolute difference + + subs a0, a0, a2 + sbcs a1, a1, a3 + csetm d0, cc + eor a0, a0, d0 + subs a0, a0, d0 + eor a1, a1, d0 + sbc a1, a1, d0 + +// Form H' = H + L_hi (which fits in 4 words) + + adds s4, s4, s2 + adcs s5, s5, s3 + adcs s6, s6, xzr + adc s7, s7, xzr + +// Let M = [d2;d1;a3;a2] = ([a3;a2] - [a1;a0])^2 + + sqr2(d2,d1,a3,a2, a1,a1short,a0,a0short, d0,s2,s3,s3short) + +// Now form (2^64 + 1) * (H'::L), with a bit of carry-shortening + + adds s2, s0, s4 + adcs s3, s1, s5 + adcs s4, s4, s6 + adcs s5, s5, s7 + csetm d0, cc + +// Subtract the middle term M + + subs s2, s2, a2 + sbcs s3, s3, a3 + sbcs s4, s4, d1 + sbcs s5, s5, d2 + adcs s6, s6, d0 + adc s7, s7, d0 + +// Store back + + stp s0, s1, [z] + stp s2, s3, [z, 16] + stp s4, s5, [z, 32] + stp s6, s7, [z, 48] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits 
+#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8_alt.S new file mode 100644 index 00000000000..b7e5eed3515 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_4_8_alt.S @@ -0,0 +1,123 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[4]; output z[8] +// +// extern void bignum_sqr_4_8_alt +// (uint64_t z[static 8], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define l x6 +#define h x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 + +// This one is the same as h, which is safe with this computation sequence + +#define u7 h + +S2N_BN_SYMBOL(bignum_sqr_4_8_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). +// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + stp u4, u5, [z, #32] + stp u6, u7, [z, #48] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12.S new file mode 100644 index 00000000000..04e530989d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12.S @@ -0,0 +1,261 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12 (uint64_t z[static 12], uint64_t x[static 6]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +.macro muldiffn c,h,l, t, x,y, w,z + subs \t, \x, \y + cneg \t, \t, cc + csetm \c, cc + subs \h, \w, \z + cneg \h, \h, cc + mul \l, \t, \h + umulh \h, \t, \h + cinv \c, \c, cc + eor \l, \l, \c + eor \h, \h, \c +.endm + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 +#define a4 x6 +#define a5 x7 + +#define c0 x8 +#define c1 x9 +#define c2 x10 +#define c3 x11 +#define c4 x12 +#define c5 x13 +#define d1 x14 +#define d2 x15 +#define d3 x16 +#define d4 x17 + +S2N_BN_SYMBOL(bignum_sqr_6_12): + +// Load in all words of the input + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + ldp a4, a5, [x1, #32] + +// Square the low half + + mul d1, a0, a1 + mul d2, a0, a2 + mul d3, a1, a2 + mul c0, a0, a0 + str c0, [x0] + mul c2, a1, a1 + mul c4, a2, a2 + + umulh d4, a0, a1 + adds d2, d2, d4 + umulh d4, a0, a2 + adcs d3, d3, d4 + umulh d4, a1, a2 + adcs d4, d4, xzr + + umulh c1, a0, a0 + umulh c3, a1, a1 + umulh c5, a2, a2 + + adds d1, d1, d1 + adcs d2, d2, d2 + adcs d3, d3, d3 + adcs d4, d4, d4 + adc c5, c5, xzr + + adds c1, c1, d1 + str c1, [x0,#8] + adcs c2, c2, d2 + str c2, [x0,#16] + adcs c3, c3, d3 + str c3, [x0,#24] + adcs c4, c4, d4 + str c4, [x0,#32] + adc c5, c5, xzr + str c5, [x0,#40] + +// Square the high half + + mul d1, a3, a4 + mul d2, a3, a5 + mul d3, a4, a5 + mul c0, a3, a3 + str c0, [x0,#48] + mul c2, a4, a4 + mul c4, a5, a5 + + umulh d4, a3, a4 + adds d2, d2, d4 + umulh d4, a3, a5 + adcs d3, d3, d4 + umulh d4, a4, a5 + adcs d4, d4, xzr + + umulh c1, a3, a3 + umulh c3, a4, a4 + umulh c5, a5, a5 + + adds d1, d1, d1 + adcs d2, d2, d2 + adcs d3, d3, d3 + adcs d4, d4, d4 + adc c5, c5, xzr + + adds c1, c1, d1 + str c1, [x0,#56] + adcs c2, c2, d2 + str c2, [x0,#64] + adcs c3, c3, d3 + str c3, [x0,#72] + adcs c4, c4, d4 + str c4, [x0,#80] + adc c5, c5, xzr + str c5, [x0,#88] + +// Compute product of the cross-term with ADK 3x3->6 multiplier + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 +#define a4 x6 +#define a5 x7 +#define s0 x8 +#define s1 x9 +#define s2 x10 +#define s3 x11 +#define s4 x12 +#define s5 x13 + +#define l1 x14 +#define l2 x15 +#define h0 x16 +#define h1 x17 +#define h2 x13 + +#define s6 h1 +#define c l1 +#define h l2 +#define l h0 +#define t h1 + + mul s0, a0, a3 + mul l1, a1, a4 + mul l2, a2, a5 + umulh h0, a0, a3 + umulh h1, a1, a4 + umulh h2, a2, a5 + + adds h0, h0, l1 + adcs h1, h1, l2 + adc h2, h2, xzr + + adds s1, h0, s0 + adcs s2, h1, h0 + adcs s3, h2, h1 + adc s4, h2, xzr + + adds s2, s2, s0 + adcs s3, s3, h0 + adcs s4, s4, h1 + adc s5, h2, xzr + + muldiffn c,h,l, t, a0,a1, a4,a3 + adds xzr, c, #1 + adcs s1, s1, l + adcs s2, s2, h + adcs s3, s3, c + adcs s4, s4, c + adc s5, s5, c + + muldiffn c,h,l, t, a0,a2, a5,a3 
+ adds xzr, c, #1 + adcs s2, s2, l + adcs s3, s3, h + adcs s4, s4, c + adc s5, s5, c + + muldiffn c,h,l, t, a1,a2, a5,a4 + adds xzr, c, #1 + adcs s3, s3, l + adcs s4, s4, h + adc s5, s5, c + +// Double it, catching the carry + + adds s0, s0, s0 + adcs s1, s1, s1 + adcs s2, s2, s2 + adcs s3, s3, s3 + adcs s4, s4, s4 + adcs s5, s5, s5 + adc s6, xzr, xzr + +// Finally, add it into the term + + ldr a0, [x0, #24] + adds a0, a0, s0 + str a0, [x0, #24] + + ldr a0, [x0, #32] + adcs a0, a0, s1 + str a0, [x0, #32] + + ldr a0, [x0, #40] + adcs a0, a0, s2 + str a0, [x0, #40] + + ldr a0, [x0, #48] + adcs a0, a0, s3 + str a0, [x0, #48] + + ldr a0, [x0, #56] + adcs a0, a0, s4 + str a0, [x0, #56] + + ldr a0, [x0, #64] + adcs a0, a0, s5 + str a0, [x0, #64] + + ldr a0, [x0, #72] + adcs a0, a0, s6 + str a0, [x0, #72] + + ldr a0, [x0, #80] + adcs a0, a0, xzr + str a0, [x0, #80] + + ldr a0, [x0, #88] + adc a0, a0, xzr + str a0, [x0, #88] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12_alt.S new file mode 100644 index 00000000000..deec8f287fe --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_6_12_alt.S @@ -0,0 +1,192 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12_alt (uint64_t z[static 12], uint64_t x[static 6]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 +#define a4 x6 +#define a5 x7 + +#define l x8 + +#define u0 x2 // The same as a0, which is safe +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 +#define u7 x15 +#define u8 x16 +#define u9 x17 +#define u10 x19 +#define u11 x20 + +S2N_BN_SYMBOL(bignum_sqr_6_12_alt): + +// It's convenient to have two more registers to play with + + stp x19, x20, [sp, #-16]! + +// Load all the elements as [a5;a4;a3;a2;a1;a0], set up an initial +// window [u8;u7; u6;u5; u4;u3; u2;u1] = [34;05;03;01], and then +// chain in the addition of 02 + 12 + 13 + 14 + 15 to that window +// (no carry-out possible since we add it to the top of a product). 
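Across the *_alt squaring routines this windowing is always the same scheme: accumulate each off-diagonal product a[i]*a[j] with i < j exactly once, double the whole accumulator, then add in the diagonal squares a[i]^2. A generic C sketch of that scheme, using __int128 accumulation in place of explicit carry flags (the name sqr_n_model is ours, not s2n-bignum's):

#include <stdint.h>
#include <string.h>

typedef unsigned __int128 u128;

/* Reference n-word squaring: off-diagonal terms once, doubled, plus squares.
 * z must have room for 2*n words. */
static void sqr_n_model(uint64_t *z, const uint64_t *a, int n) {
    memset(z, 0, 2 * n * sizeof(uint64_t));
    /* 1. Strictly upper-triangular products a[i]*a[j], i < j. */
    for (int i = 0; i < n; i++) {
        uint64_t carry = 0;
        for (int j = i + 1; j < n; j++) {
            u128 t = (u128)a[i] * a[j] + z[i + j] + carry;
            z[i + j] = (uint64_t)t;
            carry = (uint64_t)(t >> 64);
        }
        z[i + n] = carry;
    }
    /* 2. Double the accumulated cross terms. */
    uint64_t c = 0;
    for (int k = 0; k < 2 * n; k++) {
        uint64_t hi = z[k] >> 63;
        z[k] = (z[k] << 1) | c;
        c = hi;
    }
    /* 3. Add the diagonal squares a[i]^2 at word position 2*i. */
    uint64_t carry = 0;
    for (int i = 0; i < n; i++) {
        u128 t = (u128)a[i] * a[i] + z[2 * i] + carry;
        z[2 * i] = (uint64_t)t;
        u128 t2 = (u128)(uint64_t)(t >> 64) + z[2 * i + 1];
        z[2 * i + 1] = (uint64_t)t2;
        carry = (uint64_t)(t2 >> 64);
    }
}

The assembly versions differ mainly in how that window is scheduled through registers so the carry chains never have to spill.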
+ + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul l, a0, a2 + adds u2, u2, l + + mul u3, a0, a3 + mul l, a1, a2 + adcs u3, u3, l + + umulh u4, a0, a3 + mul l, a1, a3 + adcs u4, u4, l + + ldp a4, a5, [x, #32] + + mul u5, a0, a5 + mul l, a1, a4 + adcs u5, u5, l + + umulh u6, a0, a5 + mul l, a1, a5 + adcs u6, u6, l + + mul u7, a3, a4 + adcs u7, u7, xzr + + umulh u8, a3, a4 + adc u8, u8, xzr + + umulh l, a0, a2 + adds u3, u3, l + umulh l, a1, a2 + adcs u4, u4, l + umulh l, a1, a3 + adcs u5, u5, l + umulh l, a1, a4 + adcs u6, u6, l + umulh l, a1, a5 + adcs u7, u7, l + adc u8, u8, xzr + +// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms + + mul l, a0, a4 + adds u4, u4, l + mul l, a2, a3 + adcs u5, u5, l + mul l, a2, a4 + adcs u6, u6, l + mul l, a2, a5 + adcs u7, u7, l + mul l, a3, a5 + adcs u8, u8, l + mul u9, a4, a5 + adcs u9, u9, xzr + umulh u10, a4, a5 + adc u10, u10, xzr + + umulh l, a0, a4 + adds u5, u5, l + umulh l, a2, a3 + adcs u6, u6, l + umulh l, a2, a4 + adcs u7, u7, l + umulh l, a2, a5 + adcs u8, u8, l + umulh l, a3, a5 + adcs u9, u9, l + adc u10, u10, xzr + +// Double that, with h holding the top carry + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + adcs u7, u7, u7 + adcs u8, u8, u8 + adcs u9, u9, u9 + adcs u10, u10, u10 + cset u11, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + 44 + 55 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adcs u7, u7, l + + mul l, a4, a4 + adcs u8, u8, l + umulh l, a4, a4 + adcs u9, u9, l + + mul l, a5, a5 + adcs u10, u10, l + umulh l, a5, a5 + adc u11, u11, l + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + stp u4, u5, [z, #32] + stp u6, u7, [z, #48] + stp u8, u9, [z, #64] + stp u10, u11, [z, #80] + +// Restore registers and return + + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16.S new file mode 100644 index 00000000000..d79f764ea07 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16.S @@ -0,0 +1,423 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16 (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16) + .text + .balign 4 + + +S2N_BN_SYMBOL(bignum_sqr_8_16): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + +// Load registers. 
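Before the interleaved scalar/NEON body below, one practical note on checking the import: both bignum_mul_8_16 and bignum_sqr_8_16 are declared above with standard prototypes, so a throwaway cross-check is easy to build. The harness below is our own sketch, not part of s2n-bignum; it assumes the two .S files are assembled and linked into an AArch64 test binary.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Prototypes exactly as given in the file headers above. */
extern void bignum_sqr_8_16(uint64_t z[static 16], uint64_t x[static 8]);
extern void bignum_mul_8_16(uint64_t z[static 16], uint64_t x[static 8],
                            uint64_t y[static 8]);

int main(void) {
    uint64_t x[8], sq[16], prod[16];
    /* Deterministic but otherwise arbitrary 512-bit operand. */
    for (int i = 0; i < 8; i++) {
        x[i] = (0x0123456789abcdefULL * (uint64_t)(i + 1)) ^
               (0xfedcba9876543210ULL >> i);
    }
    bignum_sqr_8_16(sq, x);
    bignum_mul_8_16(prod, x, x);  /* x*x must agree with the dedicated squarer */
    if (memcmp(sq, prod, sizeof sq) != 0) {
        puts("mismatch between bignum_sqr_8_16 and bignum_mul_8_16");
        return 1;
    }
    puts("ok");
    return 0;
}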
+ ldp x2, x3, [x1] +ldr q20, [x1] + ldp x4, x5, [x1, #16] +ldr q21, [x1, #16] + ldp x6, x7, [x1, #32] +ldr q22, [x1, #32] + ldp x8, x9, [x1, #48] +ldr q23, [x1, #48] +movi v30.2d, #0xffffffff + + mul x17, x2, x4 + mul x14, x3, x5 + +// Scalar+NEON: square the lower half with a near-clone of bignum_sqr_4_8 +// NEON: prepare 64x64->128 squaring of two 64-bit ints (x2, x3) +ext v1.16b, v20.16b, v20.16b, #8 + umulh x20, x2, x4 +shrn v2.2s, v20.2d, #32 + subs x21, x2, x3 +zip1 v0.2s, v20.2s, v1.2s + cneg x21, x21, cc // cc = lo, ul, last +umull v5.2d, v2.2s, v2.2s + csetm x11, cc // cc = lo, ul, last +umull v6.2d, v2.2s, v0.2s + subs x12, x5, x4 +umull v3.2d, v0.2s, v0.2s + cneg x12, x12, cc // cc = lo, ul, last +mov v1.16b, v6.16b + mul x13, x21, x12 +usra v1.2d, v3.2d, #32 + umulh x12, x21, x12 +and v4.16b, v1.16b, v30.16b + cinv x11, x11, cc // cc = lo, ul, last +add v4.2d, v4.2d, v6.2d + eor x13, x13, x11 +usra v5.2d, v4.2d, #32 + eor x12, x12, x11 +sli v3.2d, v4.2d, #32 + adds x19, x17, x20 +usra v5.2d, v1.2d, #32 + adc x20, x20, xzr + // NEON: prepare 64x64->128 squaring of two 64-bit ints (x4, x5) + ext v1.16b, v21.16b, v21.16b, #8 + umulh x21, x3, x5 + shrn v2.2s, v21.2d, #32 + adds x19, x19, x14 + zip1 v0.2s, v21.2s, v1.2s + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 +mov x13, v3.d[1] // mul x13, x3, x3 + adcs x20, x20, x12 +mov x14, v5.d[1] // umulh x14, x3, x3 + adc x21, x21, x11 +mov x12, v3.d[0] // mul x12, x2, x2 + adds x17, x17, x17 +mov x11, v5.d[0] // umulh x11, x2, x2 + adcs x19, x19, x19 + umull v5.2d, v2.2s, v2.2s + adcs x20, x20, x20 + umull v6.2d, v2.2s, v0.2s + adcs x21, x21, x21 + umull v3.2d, v0.2s, v0.2s + adc x10, xzr, xzr + mov v1.16b, v6.16b + + mul x15, x2, x3 + usra v1.2d, v3.2d, #32 + umulh x16, x2, x3 + and v4.16b, v1.16b, v30.16b + adds x11, x11, x15 + add v4.2d, v4.2d, v6.2d + adcs x13, x13, x16 + usra v5.2d, v4.2d, #32 + adc x14, x14, xzr + sli v3.2d, v4.2d, #32 + adds x11, x11, x15 + usra v5.2d, v1.2d, #32 + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [x0] + mov x11, v5.d[0] // umulh x11, x4, x4 + adds x17, x17, x13 + mov x13, v3.d[1] // mul x13, x5, x5 + adcs x19, x19, x14 + mov x14, v5.d[1] // umulh x14, x5, x5 + adcs x20, x20, xzr + mov x12, v3.d[0] // mul x12, x4, x4 + adcs x21, x21, xzr +// NEON: prepare muls in the upper half +ext v1.16b, v22.16b, v22.16b, #8 + adc x10, x10, xzr +shrn v2.2s, v22.2d, #32 + stp x17, x19, [x0, #16] +zip1 v0.2s, v22.2s, v1.2s + mul x15, x4, x5 +umull v5.2d, v2.2s, v2.2s + umulh x16, x4, x5 +umull v6.2d, v2.2s, v0.2s + adds x11, x11, x15 +umull v3.2d, v0.2s, v0.2s + adcs x13, x13, x16 +mov v1.16b, v6.16b + adc x14, x14, xzr +usra v1.2d, v3.2d, #32 + adds x11, x11, x15 +and v4.16b, v1.16b, v30.16b + adcs x13, x13, x16 +add v4.2d, v4.2d, v6.2d + adc x14, x14, xzr +usra v5.2d, v4.2d, #32 + adds x12, x12, x20 +sli v3.2d, v4.2d, #32 + adcs x11, x11, x21 +usra v5.2d, v1.2d, #32 + stp x12, x11, [x0, #32] + // NEON: prepare muls in the upper half + ext v1.16b, v23.16b, v23.16b, #8 + adcs x13, x13, x10 + shrn v2.2s, v23.2d, #32 + adc x14, x14, xzr + zip1 v0.2s, v23.2s, v1.2s + stp x13, x14, [x0, #48] + +// Scalar: square the upper half with a slight variant of the previous block + mul x17, x6, x8 + umull v16.2d, v2.2s, v2.2s + mul x14, x7, x9 + umull v6.2d, v2.2s, v0.2s + umulh x20, x6, x8 + umull v18.2d, v0.2s, v0.2s + subs x21, x6, x7 + cneg x21, x21, cc // cc = lo, ul, last + mov v1.16b, v6.16b + csetm x11, cc // cc = lo, ul, last + subs x12, x9, 
x8 + cneg x12, x12, cc // cc = lo, ul, last + usra v1.2d, v18.2d, #32 + mul x13, x21, x12 + and v4.16b, v1.16b, v30.16b + umulh x12, x21, x12 + add v4.2d, v4.2d, v6.2d + cinv x11, x11, cc // cc = lo, ul, last + eor x13, x13, x11 + eor x12, x12, x11 + usra v16.2d, v4.2d, #32 + adds x19, x17, x20 + adc x20, x20, xzr + sli v18.2d, v4.2d, #32 + umulh x21, x7, x9 + adds x19, x19, x14 + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 +mov x14, v5.d[1] + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 +mov x13, v3.d[1] + adcs x20, x20, x12 +mov x12, v3.d[0] + adc x21, x21, x11 +mov x11, v5.d[0] + adds x17, x17, x17 + adcs x19, x19, x19 + usra v16.2d, v1.2d, #32 + adcs x20, x20, x20 + adcs x21, x21, x21 + adc x10, xzr, xzr +// NEON: two mul+umulhs for the next stage +uzp2 v17.4s, v21.4s, v23.4s + mul x15, x6, x7 +xtn v4.2s, v23.2d + umulh x16, x6, x7 + mov x22, v16.d[0] + adds x11, x11, x15 + adcs x13, x13, x16 +xtn v5.2s, v21.2d + adc x14, x14, xzr + adds x11, x11, x15 +rev64 v1.4s, v21.4s + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [x0, #64] + adds x17, x17, x13 + mov x13, v18.d[1] + adcs x19, x19, x14 + mov x14, v16.d[1] + adcs x20, x20, xzr + mov x12, v18.d[0] + adcs x21, x21, xzr + adc x10, x10, xzr +umull v6.2d, v4.2s, v5.2s + stp x17, x19, [x0, #80] +umull v7.2d, v4.2s, v17.2s + mul x15, x8, x9 +uzp2 v16.4s, v23.4s, v23.4s + umulh x16, x8, x9 +mul v0.4s, v1.4s, v23.4s + adds x11, x22, x15 + adcs x13, x13, x16 +usra v7.2d, v6.2d, #32 + adc x14, x14, xzr + adds x11, x11, x15 +umull v1.2d, v16.2s, v17.2s + adcs x13, x13, x16 + adc x14, x14, xzr +uaddlp v0.2d, v0.4s + adds x12, x12, x20 + adcs x11, x11, x21 +and v2.16b, v7.16b, v30.16b +umlal v2.2d, v16.2s, v5.2s +shl v0.2d, v0.2d, #32 +usra v1.2d, v7.2d, #32 +umlal v0.2d, v4.2s, v5.2s +mov x16, v0.d[1] +mov x15, v0.d[0] +usra v1.2d, v2.2d, #32 +mov x20, v1.d[0] +mov x21, v1.d[1] + stp x12, x11, [x0, #96] + adcs x13, x13, x10 + adc x14, x14, xzr + stp x13, x14, [x0, #112] + +// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] + + mul x10, x2, x6 + mul x14, x3, x7 + umulh x17, x2, x6 + adds x14, x14, x17 + umulh x17, x3, x7 + adcs x15, x15, x17 + adcs x16, x16, x20 + adc x17, x21, xzr + adds x11, x14, x10 + adcs x14, x15, x14 + adcs x15, x16, x15 + adcs x16, x17, x16 + adc x17, xzr, x17 + adds x12, x14, x10 + adcs x13, x15, x11 + adcs x14, x16, x14 + adcs x15, x17, x15 + adcs x16, xzr, x16 + adc x17, xzr, x17 + subs x22, x4, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x8 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x15, x15, x21 + eor x20, x20, x19 + adcs x16, x16, x20 + adc x17, x17, x19 + subs x22, x2, x3 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x7, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x11, x11, x21 + eor x20, x20, x19 + adcs x12, x12, x20 + adcs x13, x13, x19 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x7 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x14, x14, x21 + eor 
x20, x20, x19 + adcs x15, x15, x20 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x4 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x8, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x12, x12, x21 + eor x20, x20, x19 + adcs x13, x13, x20 + adcs x14, x14, x19 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x2, x5 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x9, x6 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + subs x22, x3, x4 + cneg x22, x22, cc // cc = lo, ul, last + csetm x19, cc // cc = lo, ul, last + subs x20, x8, x7 + cneg x20, x20, cc // cc = lo, ul, last + mul x21, x22, x20 + umulh x20, x22, x20 + cinv x19, x19, cc // cc = lo, ul, last + cmn x19, #0x1 + eor x21, x21, x19 + adcs x13, x13, x21 + eor x20, x20, x19 + adcs x14, x14, x20 + adcs x15, x15, x19 + adcs x16, x16, x19 + adc x17, x17, x19 + adds x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adc x19, xzr, xzr + +// Add it back to the buffer + + ldp x2, x3, [x0, #32] + adds x10, x10, x2 + adcs x11, x11, x3 + stp x10, x11, [x0, #32] + + ldp x2, x3, [x0, #48] + adcs x12, x12, x2 + adcs x13, x13, x3 + stp x12, x13, [x0, #48] + + ldp x2, x3, [x0, #64] + adcs x14, x14, x2 + adcs x15, x15, x3 + stp x14, x15, [x0, #64] + + ldp x2, x3, [x0, #80] + adcs x16, x16, x2 + adcs x17, x17, x3 + stp x16, x17, [x0, #80] + + ldp x2, x3, [x0, #96] + adcs x2, x2, x19 + adcs x3, x3, xzr + stp x2, x3, [x0, #96] + + ldp x2, x3, [x0, #112] + adcs x2, x2, xzr + adc x3, x3, xzr + stp x2, x3, [x0, #112] + + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16_alt.S new file mode 100644 index 00000000000..2faf94d00e4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/bignum_sqr_8_16_alt.S @@ -0,0 +1,280 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16_alt +// (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 +#define a4 x6 +#define a5 x7 +#define a6 x8 +#define a7 x9 + +#define l x10 + +#define u0 x2 // The same as a0, which is safe +#define u1 x11 +#define u2 x12 +#define u3 x13 +#define u4 x14 +#define u5 x15 +#define u6 x16 +#define u7 x17 +#define u8 x19 +#define u9 x20 +#define u10 x21 +#define u11 x22 +#define u12 x23 +#define u13 x24 +#define u14 x25 +#define u15 x26 + +S2N_BN_SYMBOL(bignum_sqr_8_16_alt): + +// It's convenient to have more registers to play with + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + +// Load all the elements as [a7;a6;a5;a4;a3;a2;a1;a0], set up an initial +// window [u8;u7;u6;u5;u4;u3;u2;u1] = 10 + 20 + 30 + 40 + 50 + 60 + 70 + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul l, a0, a2 + umulh u3, a0, a2 + adds u2, u2, l + + ldp a4, a5, [x, #32] + + mul l, a0, a3 + umulh u4, a0, a3 + adcs u3, u3, l + + ldp a6, a7, [x, #48] + + mul l, a0, a4 + umulh u5, a0, a4 + adcs u4, u4, l + + mul l, a0, a5 + umulh u6, a0, a5 + adcs u5, u5, l + + mul l, a0, a6 + umulh u7, a0, a6 + adcs u6, u6, l + + mul l, a0, a7 + umulh u8, a0, a7 + adcs u7, u7, l + + adc u8, u8, xzr + +// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54 + + mul l, a1, a2 + adds u3, u3, l + mul l, a1, a3 + adcs u4, u4, l + mul l, a1, a4 + adcs u5, u5, l + mul l, a1, a5 + adcs u6, u6, l + mul l, a1, a6 + adcs u7, u7, l + mul l, a1, a7 + adcs u8, u8, l + cset u9, cs + + umulh l, a1, a2 + adds u4, u4, l + umulh l, a1, a3 + adcs u5, u5, l + umulh l, a1, a4 + adcs u6, u6, l + umulh l, a1, a5 + adcs u7, u7, l + umulh l, a1, a6 + adcs u8, u8, l + umulh l, a1, a7 + adc u9, u9, l + mul l, a4, a5 + umulh u10, a4, a5 + adds u9, u9, l + adc u10, u10, xzr + +// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65 + + mul l, a2, a3 + adds u5, u5, l + mul l, a2, a4 + adcs u6, u6, l + mul l, a2, a5 + adcs u7, u7, l + mul l, a2, a6 + adcs u8, u8, l + mul l, a2, a7 + adcs u9, u9, l + mul l, a4, a6 + adcs u10, u10, l + cset u11, cs + + umulh l, a2, a3 + adds u6, u6, l + umulh l, a2, a4 + adcs u7, u7, l + umulh l, a2, a5 + adcs u8, u8, l + umulh l, a2, a6 + adcs u9, u9, l + umulh l, a2, a7 + adcs u10, u10, l + umulh l, a4, a6 + adc u11, u11, l + mul l, a5, a6 + umulh u12, a5, a6 + adds u11, u11, l + adc u12, u12, xzr + +// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76 + + mul l, a3, a4 + adds u7, u7, l + mul l, a3, a5 + adcs u8, u8, l + mul l, a3, a6 + adcs u9, u9, l + mul l, a3, a7 + adcs u10, u10, l + mul l, a4, a7 + adcs u11, u11, l + mul l, a5, a7 + adcs u12, u12, l + cset u13, cs + + umulh l, a3, a4 + adds u8, u8, l + umulh l, a3, a5 + adcs u9, u9, l + umulh l, a3, a6 + adcs u10, u10, l + umulh l, a3, a7 + adcs u11, u11, l + umulh l, a4, a7 + adcs u12, u12, l + umulh l, a5, a7 + adc u13, u13, l + mul l, a6, a7 + umulh u14, a6, a7 + adds u13, u13, l + 
adc u14, u14, xzr + +// Double that, with u15 holding the top carry + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + adcs u7, u7, u7 + adcs u8, u8, u8 + adcs u9, u9, u9 + adcs u10, u10, u10 + adcs u11, u11, u11 + adcs u12, u12, u12 + adcs u13, u13, u13 + adcs u14, u14, u14 + cset u15, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + 44 + 55 + 66 + 77 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adcs u7, u7, l + + mul l, a4, a4 + adcs u8, u8, l + umulh l, a4, a4 + adcs u9, u9, l + + mul l, a5, a5 + adcs u10, u10, l + umulh l, a5, a5 + adcs u11, u11, l + + mul l, a6, a6 + adcs u12, u12, l + umulh l, a6, a6 + adcs u13, u13, l + + mul l, a7, a7 + adcs u14, u14, l + umulh l, a7, a7 + adc u15, u15, l + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + stp u4, u5, [z, #32] + stp u6, u7, [z, #48] + stp u8, u9, [z, #64] + stp u10, u11, [z, #80] + stp u12, u13, [z, #96] + stp u14, u15, [z, #112] + +// Restore registers and return + + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_base.S similarity index 90% rename from third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_base.S index 081f5de362d..41e63d1fa27 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_base.S @@ -5,7 +5,7 @@ // Extended Montgomery reduce in 8-digit blocks, results in input-output buffer // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] // -// extern uint64_t bignum_emontredc_8n +// extern uint64_t bignum_emontredc_8n_base // (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); // // Functionally equivalent to bignum_emontredc (see that file for more detail). 
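Reviewer note: the bignum_sqr_8_16_alt routine above follows the classic squaring split — build the off-diagonal products a[i]*a[j] (i < j) in a running window, double them, then fold in the diagonal (homogeneous) squares a[i]*a[i]. A schoolbook C model of that structure (illustrative only, not part of the import; it uses the compiler's unsigned __int128 for the 64x64 products and ignores the assembly's careful carry scheduling):

```c
#include <stdint.h>

// Schoolbook model of the 8x8 -> 16 word squaring structure: off-diagonal
// products, a doubling pass, then the diagonal squares a[i]^2.
static void sqr_8_16_model(uint64_t z[16], const uint64_t a[8]) {
  for (int i = 0; i < 16; i++) z[i] = 0;

  // Off-diagonal products a[i]*a[j] for i < j, accumulated column by column.
  for (int i = 0; i < 8; i++) {
    unsigned __int128 c = 0;
    for (int j = i + 1; j < 8; j++) {
      c += (unsigned __int128)a[i] * a[j] + z[i + j];
      z[i + j] = (uint64_t)c;
      c >>= 64;
    }
    z[i + 8] = (uint64_t)c;      // this slot has not been written yet
  }

  // Double the cross terms; the total still fits in 16 words.
  uint64_t carry = 0;
  for (int i = 0; i < 16; i++) {
    uint64_t w = z[i];
    z[i] = (w << 1) | carry;
    carry = w >> 63;
  }

  // Add the "homogeneous" terms a[i]*a[i] at word position 2*i.
  unsigned __int128 c = 0;
  for (int i = 0; i < 8; i++) {
    unsigned __int128 s = (unsigned __int128)a[i] * a[i];
    c += (unsigned __int128)z[2 * i] + (uint64_t)s;
    z[2 * i] = (uint64_t)c;
    c >>= 64;
    c += (unsigned __int128)z[2 * i + 1] + (uint64_t)(s >> 64);
    z[2 * i + 1] = (uint64_t)c;
    c >>= 64;
  }
}
```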
@@ -15,8 +15,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n_base) .text .balign 4 @@ -29,18 +29,18 @@ // --------------------------------------------------------------------------- #define muldiffnadd(b,a, c,h,l,t, x,y, w,z) \ - subs t, x, y ; \ - cneg t, t, cc ; \ - csetm c, cc ; \ - subs h, w, z ; \ - cneg h, h, cc ; \ - mul l, t, h ; \ - umulh h, t, h ; \ - cinv c, c, cc ; \ - adds xzr, c, #1 ; \ - eor l, l, c ; \ - adcs a, a, l ; \ - eor h, h, c ; \ + subs t, x, y __LF\ + cneg t, t, cc __LF\ + csetm c, cc __LF\ + subs h, w, z __LF\ + cneg h, h, cc __LF\ + mul l, t, h __LF\ + umulh h, t, h __LF\ + cinv c, c, cc __LF\ + adds xzr, c, #1 __LF\ + eor l, l, c __LF\ + adcs a, a, l __LF\ + eor h, h, c __LF\ adcs b, b, h // The inputs, though k gets processed so we use a different name @@ -196,9 +196,9 @@ // Main code // ***************************************************** -S2N_BN_SYMBOL(bignum_emontredc_8n): +S2N_BN_SYMBOL(bignum_emontredc_8n_base): -stp x19, x20, [sp, #-16]! + stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! @@ -211,7 +211,7 @@ stp x19, x20, [sp, #-16]! lsr k4m1, x0, #2 mov i, k4m1 subs c, k4m1, #1 - bcc bignum_emontredc_8n_end + bcc bignum_emontredc_8n_base_end mov tc, xzr lsl k4m1, c, #5 @@ -219,7 +219,7 @@ stp x19, x20, [sp, #-16]! // Rather than propagating the carry to the end each time, we // stop at the "natural" end and store top carry in tc as a bitmask. -bignum_emontredc_8n_outerloop: +bignum_emontredc_8n_base_outerloop: // Load [u3;u2;u1;u0] = bottom 4 digits of the input at current window @@ -325,9 +325,8 @@ bignum_emontredc_8n_outerloop: // Repeated multiply-add block to do the k/4-1 remaining 4-digit chunks - cbz k4m1, bignum_emontredc_8n_madddone mov j, k4m1 -bignum_emontredc_8n_maddloop: +bignum_emontredc_8n_base_maddloop: add m, m, #32 add z, z, #32 @@ -335,8 +334,8 @@ bignum_emontredc_8n_maddloop: ldp b2, b3, [m, #16] madd4 subs j, j, #32 - bne bignum_emontredc_8n_maddloop -bignum_emontredc_8n_madddone: + bne bignum_emontredc_8n_base_maddloop +bignum_emontredc_8n_base_madddone: // Add the carry out to the existing z contents, propagating the // top carry tc up by 32 places as we move "leftwards". @@ -360,14 +359,14 @@ bignum_emontredc_8n_madddone: // Bump up z only and keep going add z, z, #32 - subs i, i, #1 - bne bignum_emontredc_8n_outerloop + sub i, i, #1 + cbnz i, bignum_emontredc_8n_base_outerloop // Return the top carry as 0 or 1 (it's currently a bitmask) neg x0, tc -bignum_emontredc_8n_end: +bignum_emontredc_8n_base_end: ldp x27, x28, [sp], #16 ldp x25, x26, [sp], #16 ldp x23, x24, [sp], #16 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_cdiff_base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_cdiff_base.S new file mode 100644 index 00000000000..41c700b1d03 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_emontredc_8n_cdiff_base.S @@ -0,0 +1,681 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC + +// ---------------------------------------------------------------------------- +// Extend Montgomery reduce in 8-digit blocks, uses an extra storage to +// temporarily cache multiplied differences appearing in ADK. +// Results are stored in input-output buffer (z). +// Inputs z[2*k], m[k], w; +// Outputs function return (extra result bit) and z[2*k] +// Temporary buffer m_precalc[12*(k/4-1)] +// +// extern uint64_t bignum_emontredc_8n_cdiff +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w, uint64_t *m_precalc); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = w, X4 = m_precalc +// returns X0 +// +// This is an unoptimized version of bignum_emontredc_8n_cdiff. +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n_cdiff_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n_cdiff_base) + .text + .balign 4 + + // Silly SLOTHY limitation: It needs the loop counter to have the name 'count' + count .req x27 // inner loop counter + + // Semantically transparent instruction wrapper which is used by SLOTHY + // for dependency tracking through memory. SLOTHY itself has no notion of + // memory, only registers; to still track static dependencies through memory + // (register spills), a 'hint' register type is introduced (syntax t{i}, t{i}{j}) + // that's written to in store instructions and read from in the corresponding + // load instruction. + // + // The 'slothy:no-unfold' annotation prevents SLOTHY from opening the macro, + // and instead makes it treat `stph` as an instruction specified in the + // Arch and uArch models provided to it. + .macro stph a, b, addr, imm, hint // slothy:no-unfold + stp \a\(), \b\(), [\addr, \imm] + .endm + + .macro ldph a, b, addr, imm, hint // slothy:no-unfold + ldp \a\(), \b\(), [\addr, \imm] + .endm + + .macro ldrh a, addr, imm, hint // slothy:no-unfold + ldr \a\(), [\addr, \imm] + .endm + + // Helper macro for the pre-computations + .macro cdiff t, c, x, y + subs \t, \x, \y + cneg \t, \t, cc + csetm \c, cc + .endm + + // Some immediate offsets for cached differences+carry used + // in the inner ADK multiplications + #define cache_a01 (32+0*16) + #define cache_a02 (32+1*16) + #define cache_a03 (32+2*16) + #define cache_a12 (32+3*16) + #define cache_a13 (32+4*16) + #define cache_a23 (32+5*16) + #define cache_m10 (0*16) + #define cache_m20 (1*16) + #define cache_m30 (2*16) + #define cache_m21 (3*16) + #define cache_m31 (4*16) + #define cache_m32 (5*16) + + a0 .req x4 + a1 .req x5 + a2 .req x6 + a3 .req x7 + + vpre00 .req v30 + vpre01 .req v28 + vpre02 .req v17 + vpre10 .req v18 + vpre11 .req v19 + vpre12 .req v20 + + // Computes two 64x64->128-bit multiplication a*x and a*y + // v_in0: 128-bit input vector viewed as pair (x,y) of 64-bit numbers + // x_in: 64-bit common multiplicand a + // v_out0: 128-bit output vector to hold a*x + // v_out1: 128-bit output vector to hold a*y + // + // Uses temporaries as indicated in the following defines: + #define v_in0_p v3 + #define v_in0_pp v5 + #define v_in0_ppp v28 + #define v_in1 v0 + #define vtmp0 v4 + #define vtmp1 v6 + #define vtmp2 v7 + #define vtmp3 v16 + #define vtmp4 v2 + + .macro vmul_2x_64_64_128 v_in0, x_in, v_out0, v_out1 // slothy:no-unfold + dup v_in1.2d, \x_in + uzp2 v_in0_p.4s, \v_in0\().4s, \v_in0\().4s + xtn vtmp0.2s, v_in1.2d + xtn v_in0_pp.2s, \v_in0\().2d + rev64 v_in0_ppp.4s, \v_in0\().4s + umull vtmp1.2d, vtmp0.2s, 
v_in0_pp.2s + umull vtmp2.2d, vtmp0.2s, v_in0_p.2s + uzp2 vtmp3.4s, v_in1.4s, v_in1.4s + mul v_in1.4s, v_in0_ppp.4s, v_in1.4s + usra vtmp2.2d, vtmp1.2d, #32 + umull \v_out1\().2d, vtmp3.2s, v_in0_p.2s + uaddlp v_in1.2d, v_in1.4s + and vtmp4.16b, vtmp2.16b, v29.16b + umlal vtmp4.2d, vtmp3.2s, v_in0_pp.2s + shl \v_out0\().2d, v_in1.2d, #32 + usra \v_out1\().2d, vtmp2.2d, #32 + umlal \v_out0\().2d, vtmp0.2s, v_in0_pp.2s + usra \v_out1\().2d, vtmp4.2d, #32 + .endm + + // SLOTHY version of the above multiplication macro, using symbolic + // registers instead of hardcoded registers. This is only used during + // SLOTHY optimization (the above macro is ignored because of + // 'slothy:no-unfold'). +#if defined(SLOTHY) + .macro vmul_2x_64_64_128 v_in0, x_in, v_out0, v_out1 + dup V.2d, \x_in + uzp2 V.4s, \v_in0\().4s, \v_in0\().4s + xtn V.2s, V.2d + xtn V.2s, \v_in0\().2d + rev64 V.4s, \v_in0\().4s + umull V.2d, V.2s, V.2s + umull V.2d, V.2s, V.2s + uzp2 V.4s, V.4s, V.4s + mul V.4s, V.4s, V.4s + usra V.2d, V.2d, #32 + umull \v_out1\().2d, V.2s, V.2s + uaddlp V.2d, V.4s + and V.16b, V.16b, v29.16b + umlal V.2d, V.2s, V.2s + shl \v_out0\().2d, V.2d, #32 + usra \v_out1\().2d, V.2d, #32 + umlal \v_out0\().2d, V.2s, V.2s + usra \v_out1\().2d, V.2d, #32 + .endm +#endif + +S2N_BN_SYMBOL(bignum_emontredc_8n_cdiff_base): + + sub sp, sp, #(6*16) + stp x19, x20, [sp, #(5*16)] + stp x21, x22, [sp, #(4*16)] + stp x23, x24, [sp, #(3*16)] + stp x25, x26, [sp, #(2*16)] + stp x27, x28, [sp, #(1*16)] + stp x29, x30, [sp, #(0*16)] + + // Leave space for cached differences in inner loop + sub sp, sp, #(6*16) + + sub sp, sp, #32 + lsr x0, x0, #2 + mov x26, x0 + subs x12, x0, #1 + bcc bignum_emontredc_8n_cdiff_base_end + + // x30 = buffer holding precomputed ADK carry-differences for modulus + mov w30, #(12*8) + mul w30, w12, w30 + sub x30, sp, x30 + + // + // Start of precomputation + // + // Precompute and cache signed differences of modulus components + // used in the ADK multiplication in the inner loop. + // + // THIS SHOULD BE HOISTED OUT + // (and until then, comment out for benchmarking to get accurate estimates) + // + + // Number of extra limbs required: + // 6 * (number of limbs / 4 - 1) * 2 = 12 * (number_of_limbs/4 - 1) + // + // For now, just put them on the stack + mov sp, x30 + + // Save modulus pointer + mov x25, x2 + + mov count, x12 +bignum_emontredc_8n_cdiff_base_precomp: + ldp a0, a1, [x2, #32]! + ldp a2, a3, [x2, #16] + + t .req x28 + c .req x29 + + cdiff t, c, a1, a0 + stp t, c, [sp, #cache_m10] + cdiff t, c, a2, a0 + stp t, c, [sp, #cache_m20] + cdiff t, c, a3, a0 + stp t, c, [sp, #cache_m30] + cdiff t, c, a2, a1 + stp t, c, [sp, #cache_m21] + cdiff t, c, a3, a1 + stp t, c, [sp, #cache_m31] + cdiff t, c, a3, a2 + stp t, c, [sp, #cache_m32] + + add sp, sp, #(6*16) + + subs count, count, #1 + cbnz count, bignum_emontredc_8n_cdiff_base_precomp + + // Set modulus pointer back to its original value + mov x2, x25 + + // + // End of precomputation + // + + stp x3, x30, [sp] + //stp x3, xzr, [sp] + stp x26, xzr, [sp, #16] + mov x28, xzr + lsl x0, x12, #5 + + movi v29.2d, #0x000000ffffffff + +bignum_emontredc_8n_cdiff_base_outerloop: + ldr x3, [sp] + ldph x17, x19, x1, #0, t0 + ldph x20, x21, x1, #16, t1 + ldp x8, x9, [x2, #0] + ldp x10, x11, [x2, #16] + ldr q21, [x2, #16] + + // Montgomery step 0 + + mul x4, x17, x3 + // NEON: Calculate x4 * (x10, x11) that does two 64x64->128-bit multiplications. 
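Reviewer note: the vmul_2x_64_64_128 macro defined above packs two independent 64x64->128-bit products into the two 64-bit lanes of a vector register, forming each from 32-bit halves with umull/umlal and recombining the columns with shl/usra. Per lane this is the ordinary four-partial-product decomposition; a scalar C sketch of it (the function name is invented):

```c
#include <stdint.h>

// One 64x64 -> 128 multiply built from 32-bit halves, the same decomposition
// the NEON umull/umlal/usra sequence evaluates per 64-bit lane.
static void mul_64x64_128(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
  uint64_t a0 = (uint32_t)a, a1 = a >> 32;
  uint64_t b0 = (uint32_t)b, b1 = b >> 32;
  uint64_t ll = a0 * b0;                                     // low  x low
  uint64_t lh = a0 * b1;                                     // low  x high
  uint64_t hl = a1 * b0;                                     // high x low
  uint64_t hh = a1 * b1;                                     // high x high
  uint64_t mid = (ll >> 32) + (uint32_t)lh + (uint32_t)hl;   // middle column
  *lo = (mid << 32) | (uint32_t)ll;
  *hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
}
```

Doing two of these at once on the vector unit is why the scalar code in each Montgomery step below only multiplies by x8 and x9 directly, while the x10/x11 products come back from the NEON registers.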
+ vmul_2x_64_64_128 v21, x4, v0, v1 + mov x14, v0.d[0] + mov x15, v0.d[1] + mul x12, x4, x8 + adds x17, x17, x12 + umulh x12, x4, x8 + mul x13, x4, x9 + adcs x19, x19, x13 + umulh x13, x4, x9 + adcs x20, x20, x14 + adcs x21, x21, x15 + mov x14, v1.d[0] + mov x15, v1.d[1] + adc x22, xzr, xzr + adds x19, x19, x12 + adcs x20, x20, x13 + adcs x21, x21, x14 + adc x22, x22, x15 + + // Montgomery step 1 + + mul x5, x19, x3 + // NEON: Calculate x5 * (x10, x11) that does two 64x64->128-bit multiplications. + vmul_2x_64_64_128 v21, x5, v0, v1 + mov x14, v0.d[0] + mov x15, v0.d[1] + mul x12, x5, x8 + adds x19, x19, x12 + umulh x12, x5, x8 + mul x13, x5, x9 + adcs x20, x20, x13 + umulh x13, x5, x9 + adcs x21, x21, x14 + adcs x22, x22, x15 + mov x14, v1.d[0] + mov x15, v1.d[1] + adc x23, xzr, xzr + adds x20, x20, x12 + adcs x21, x21, x13 + adcs x22, x22, x14 + adc x23, x23, x15 + stph x4, x5, x1, #0, t0 + + // Montgomery step 2 + + mul x6, x20, x3 + // NEON: Calculate x6 * (x10, x11) that does two 64x64->128-bit multiplications. + vmul_2x_64_64_128 v21, x6, v21, v1 + mov x14, v21.d[0] + mov x15, v21.d[1] + mul x12, x6, x8 + adds x20, x20, x12 + umulh x12, x6, x8 + mul x13, x6, x9 + adcs x21, x21, x13 + umulh x13, x6, x9 + adcs x22, x22, x14 + adcs x23, x23, x15 + mov x14, v1.d[0] + mov x15, v1.d[1] + adc x24, xzr, xzr + adds x21, x21, x12 + mul x7, x21, x3 + adcs x22, x22, x13 + adcs x23, x23, x14 + adc x24, x24, x15 + + stph x6, x7, x1, #16, t1 + + // Montgomery step 3 + + mul x12, x7, x8 + mul x13, x7, x9 + mul x14, x7, x10 + mul x15, x7, x11 + adds x21, x21, x12 + umulh x12, x7, x8 + adcs x22, x22, x13 + umulh x13, x7, x9 + adcs x23, x23, x14 + umulh x14, x7, x10 + adcs x24, x24, x15 + umulh x15, x7, x11 + adc x25, xzr, xzr + adds x12, x22, x12 + adcs x13, x23, x13 + adcs x14, x24, x14 + adc x15, x25, x15 + + lsr count, x0, #5 + + ldrh q20, x1, #0, t0 + ldrh q21, x1, #16, t1 + + // Precompute and cache differences required in the + // ADK multiplication conducted by the innerl oop. + // Save each difference (somewhat inefficiently) + // as a pair (t,c) of 64-bit + carry. + // + // The same caching trick is applied to the modulus, + // for which the various differences can even be hoisted + // out of the entire multiplication routine. + + // a0 - a1 with carry + cdiff x16,x26,a0,a1 + stph x16, x26, sp, #cache_a01, t01 + // a0 - a2 with carry + cdiff x16,x26,a0,a2 + stph x16, x26, sp, #cache_a02, t02 + // a0 - a3 with carry + cdiff x16,x26,a0,a3 + stph x16, x26, sp, #cache_a03, t03 + // a1 - a2 with carry + cdiff x16,x26,a1,a2 + stph x16, x26, sp, #cache_a12, t12 + // a1 - a3 with carry + cdiff x16,x26,a1,a3 + stph x16, x26, sp, #cache_a13, t13 + // a2 - a3 with carry + cdiff x16,x26,a2,a3 + stph x16, x26, sp, #cache_a23, t23 + + // Precompute and cache some precomputations for + // the Neon multiplications in the inner loop + uzp2 vpre00.4s, v20.4s, v20.4s + xtn vpre01.2s, v20.2d + rev64 vpre02.4s, v20.4s + uzp2 vpre10.4s, v21.4s, v21.4s + xtn vpre11.2s, v21.2d + rev64 vpre12.4s, v21.4s + +bignum_emontredc_8n_cdiff_base_maddloop_neon: + + ldr q22, [x2, #32]! 
+ ldr q23, [x2, #16] + + xtn v4.2s, v22.2d + umull v6.2d, v4.2s, vpre01.2s + umull v7.2d, v4.2s, vpre00.2s + uzp2 v16.4s, v22.4s, v22.4s + mul v0.4s, vpre02.4s, v22.4s + usra v7.2d, v6.2d, #32 + umull v25.2d, v16.2s, vpre00.2s + uaddlp v0.2d, v0.4s + and v2.16b, v7.16b, v29.16b + umlal v2.2d, v16.2s, vpre01.2s + shl v24.2d, v0.2d, #32 + usra v25.2d, v7.2d, #32 + umlal v24.2d, v4.2s, vpre01.2s + usra v25.2d, v2.2d, #32 + + // Original version without caching + // uzp2 v3.4s, v22.4s, v22.4s + // xtn v4.2s, v20.2d + // xtn v5.2s, v22.2d + // rev64 v1.4s, v22.4s + // umull v6.2d, v4.2s, v5.2s + // umull v7.2d, v4.2s, v3.2s + // uzp2 v16.4s, v20.4s, v20.4s + // mul v0.4s, v1.4s, v20.4s + // usra v7.2d, v6.2d, #32 + // umull v25.2d, v16.2s, v3.2s + // uaddlp v0.2d, v0.4s + // and v2.16b, v7.16b, v29.16b + // umlal v2.2d, v16.2s, v5.2s + // shl v24.2d, v0.2d, #32 + // usra v25.2d, v7.2d, #32 + // umlal v24.2d, v4.2s, v5.2s + // usra v25.2d, v2.2d, #32 + + xtn v4.2s, v23.2d + umull v6.2d, v4.2s, vpre11.2s + umull v7.2d, v4.2s, vpre10.2s + uzp2 v16.4s, v23.4s, v23.4s + mul v0.4s, vpre12.4s, v23.4s + usra v7.2d, v6.2d, #32 + umull v27.2d, v16.2s, vpre10.2s + uaddlp v0.2d, v0.4s + and v2.16b, v7.16b, v29.16b + umlal v2.2d, v16.2s, vpre11.2s + shl v26.2d, v0.2d, #32 + usra v27.2d, v7.2d, #32 + umlal v26.2d, v4.2s, vpre11.2s + usra v27.2d, v2.2d, #32 + + // Original version without caching + // uzp2 v3.4s, v23.4s, v23.4s + // xtn v4.2s, v21.2d + // xtn v5.2s, v23.2d + // rev64 v1.4s, v23.4s + // umull v6.2d, v4.2s, v5.2s + // umull v7.2d, v4.2s, v3.2s + // uzp2 v16.4s, v21.4s, v21.4s + // mul v0.4s, v1.4s, v21.4s + // usra v7.2d, v6.2d, #32 + // umull v27.2d, v16.2s, v3.2s + // uaddlp v0.2d, v0.4s + // and v2.16b, v7.16b, v29.16b + // umlal v2.2d, v16.2s, v5.2s + // shl v26.2d, v0.2d, #32 + // usra v27.2d, v7.2d, #32 + // umlal v26.2d, v4.2s, v5.2s + // usra v27.2d, v2.2d, #32 + + mov x16, v25.d[0] // hi bits of (x4 * x8) + mov x26, v27.d[0] // hi bits of (x6 * x10) + mov x3, v25.d[1] // hi bits of (x5 * x9) + mov x17, v27.d[1] // hi bits of (x6 * x10) + + mov x20, v24.d[1] // lo bits of (x5 * x9) + mov x21, v26.d[0] // lo bits of (x6 * x10) + mov x24, v26.d[1] // lo bits of (x7 * x11) + + // Not necessary if one uses cached differences for the modulus + //ldp x8, x9, [x2, #0] + //ldp x10, x11, [x2, #16] + + adds x22, x20, x16 + adcs x23, x21, x3 + adcs x24, x24, x26 + adc x25, x17, xzr + mov x17, v24.d[0] // lo bits of (x4 * x8) + ldp x20, x21, [x1, #32]! 
+ adds x12, x12, x20 + adcs x13, x13, x21 + ldp x20, x21, [x1, #16] + adcs x14, x14, x20 + adcs x15, x15, x21 + adc x16, xzr, xzr + adds x19, x22, x17 + adcs x22, x23, x22 + adcs x23, x24, x23 + adcs x24, x25, x24 + adc x25, xzr, x25 + adds x20, x22, x17 + adcs x21, x23, x19 + adcs x22, x24, x22 + adcs x23, x25, x23 + adcs x24, xzr, x24 + adc x25, xzr, x25 + adds x17, x17, x12 + adcs x19, x19, x13 + adcs x20, x20, x14 + adcs x21, x21, x15 + adcs x22, x22, x16 + adcs x23, x23, xzr + adcs x24, x24, xzr + adc x25, x25, xzr + + ldph x15, x12, sp, #cache_a23, t23 + // Original code without caching + //subs x15, x6, x7 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m32] + eor x12, x12, x14 + // Original code without caching + //cdiff x13, x14, x11, x10 + //subs x13, x11, x10 + //cneg x13, x13, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x23, x23, x14 + eor x13, x13, x12 + adcs x24, x24, x13 + adc x25, x25, x12 + + ldph x15, x12, sp, #cache_a01, t01 + //subs x15, x4, x5 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m10] + eor x12, x12, x14 + // Original code without caching + //subs x13, x9, x8 + //cneg x13, x13, cc + //cinv x12, x12, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x19, x19, x14 + eor x13, x13, x12 + adcs x20, x20, x13 + adcs x21, x21, x12 + adcs x22, x22, x12 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + + stp x17, x19, [x1, #0] + + ldph x15, x12, sp, #cache_a13, t13 + //subs x15, x5, x7 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m31] + eor x12, x12, x14 + // Original code without caching + //subs x13, x11, x9 + //cneg x13, x13, cc + //cinv x12, x12, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x22, x22, x14 + eor x13, x13, x12 + adcs x23, x23, x13 + adcs x24, x24, x12 + adc x25, x25, x12 + + ldph x15, x12, sp, #cache_a02, t02 + //subs x15, x4, x6 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m20] + eor x12, x12, x14 + // Original code without caching + //subs x13, x10, x8 + //cneg x13, x13, cc + //cinv x12, x12, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x20, x20, x14 + eor x13, x13, x12 + adcs x21, x21, x13 + adcs x22, x22, x12 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + + ldph x15, x12, sp, #cache_a03, t03 + //subs x15, x4, x7 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m30] + eor x12, x12, x14 + // Original code without caching + //subs x13, x11, x8 + //cneg x13, x13, cc + //cinv x12, x12, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x21, x21, x14 + eor x13, x13, x12 + adcs x22, x22, x13 + adcs x23, x23, x12 + adcs x24, x24, x12 + adc x25, x25, x12 + + ldph x15, x12, sp, #cache_a12, t12 + //subs x15, x5, x6 + //cneg x15, x15, cc + //csetm x12, cc + + ldp x13, x14, [x30, #cache_m21] + eor x12, x12, x14 + // Original code without caching + //subs x13, x10, x9 + //cneg x13, x13, cc + //cinv x12, x12, cc + + mul x14, x15, x13 + umulh x13, x15, x13 + adds xzr, x12, #1 + eor x14, x14, x12 + adcs x21, x21, x14 + + stp x20, x21, [x1, #16] + eor x13, x13, x12 + adcs x22, x22, x13 + adcs x13, x23, x12 + adcs x14, x24, x12 + adc x15, x25, x12 + mov x12, x22 + + add x30, x30, #96 + + sub count, count, #1 + cbnz count, bignum_emontredc_8n_cdiff_base_maddloop_neon + + ldp x17, x19, 
[x1, #32] + ldp x20, x21, [x1, #48] + ldp x26, xzr, [sp, #16] + adds xzr, x28, x28 + adcs x17, x17, x12 + adcs x19, x19, x13 + adcs x20, x20, x14 + adcs x21, x21, x15 + csetm x28, cs + stp x17, x19, [x1, #32] + stp x20, x21, [x1, #48] + sub x1, x1, x0 + sub x2, x2, x0 + add x1, x1, #32 + subs x26, x26, #1 + stp x26, xzr, [sp, #16] + + // Restore buffer base for cached modulus differences + ldr x30, [sp, #8] + + bne bignum_emontredc_8n_cdiff_base_outerloop + neg x0, x28 + +bignum_emontredc_8n_cdiff_base_end: + add sp, sp, #32 + add sp, sp, #(6*16) + + ldp x29, x30, [sp, #(0*16)] + ldp x27, x28, [sp, #(1*16)] + ldp x25, x26, [sp, #(2*16)] + ldp x23, x24, [sp, #(3*16)] + ldp x21, x22, [sp, #(4*16)] + ldp x19, x20, [sp, #(5*16)] + add sp, sp, #(6*16) + + ret + diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_mul_8_16_base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_mul_8_16_base.S new file mode 100644 index 00000000000..d87eb806bc5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_mul_8_16_base.S @@ -0,0 +1,349 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16_base +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_base) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro computing [c,b,a] := [b,a] + (x - y) * (w - z), adding with carry +// to the [b,a] components but leaving CF aligned with the c term, which is +// a sign bitmask for (x - y) * (w - z). Continued add-with-carry operations +// with [c,...,c] will continue the carry chain correctly starting from +// the c position if desired to add to a longer term of the form [...,b,a]. +// +// c,h,l,t should all be different and t,h should not overlap w,z. 
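Reviewer note: the muldiffnadd macro documented just above is the core trick of the ADK multiplications in these files — the signed product (x - y)*(w - z) is formed in sign-magnitude form, and the sign is kept as an all-zeros/all-ones mask c so it can be folded into the accumulator (and propagated into higher words) with ordinary add-with-carry. A C model of the sign/magnitude part (illustrative only; the real macro keeps c live in a register and continues the chain with adcs):

```c
#include <stdint.h>

// Sign-magnitude model of the difference product in muldiffnadd: return
// |x - y| * |w - z| and set *c to all-ones exactly when (x - y)*(w - z) is
// negative.  The macro then adds the product words XOR c with the carry flag
// preset by "adds xzr, c, #1" (carry-in 1 in the negative case, supplying the
// +1 of the two's complement), and keeps adding c into higher words to finish
// the sign extension.
static unsigned __int128 diff_product(uint64_t x, uint64_t y,
                                      uint64_t w, uint64_t z, uint64_t *c) {
  uint64_t t = x < y ? y - x : x - y;            // subs / cneg
  uint64_t h = w < z ? z - w : w - z;            // subs / cneg
  *c = ((x < y) ^ (w < z)) ? ~(uint64_t)0 : 0;   // csetm / cinv
  return (unsigned __int128)t * h;               // mul / umulh
}
```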
+// --------------------------------------------------------------------------- + +.macro muldiffnadd b,a, c,h,l,t, x,y, w,z + subs \t, \x, \y + cneg \t, \t, cc + csetm \c, cc + subs \h, \w, \z + cneg \h, \h, cc + mul \l, \t, \h + umulh \h, \t, \h + cinv \c, \c, cc + adds xzr, \c, #1 + eor \l, \l, \c + adcs \a, \a, \l + eor \h, \h, \c + adcs \b, \b, \h +.endm + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define s0 x11 +#define s1 x12 +#define s2 x13 +#define s3 x14 +#define s4 x15 +#define s5 x16 +#define s6 x17 +#define s7 x19 + +#define c x20 +#define h x21 +#define l x22 +#define m x23 +#define t x24 + +// These alias the ax and bx values, and are only used when they are done with + +#define u0 x3 +#define u1 x4 +#define u2 x5 +#define u3 x6 +#define u4 x7 +#define u5 x8 +#define u6 x9 +#define u7 x10 + +// These alias c,h,l,m but leave s, t and d safe, all we need + +#define u8 x20 +#define u9 x21 +#define u10 x22 +#define u11 x23 + +// We recycle the input pointers near the end + +#define s x1 +#define d x2 + +// --------------------------------------------------------------------------- +// Core 4x4->8 ADK multiplication macro +// Does [s7,s6,s5,s4,s3,s2,s1,s0] = [a3,a2,a1,a0] * [b3,b2,b1,b0] +// +// If the input parameter is 1, it also adds in [z+32,z+40,z+48,z+56] +// existing contents; if the parameter is 0 it just does the pure multiply +// --------------------------------------------------------------------------- + +.macro mul4 afl + +// First accumulate all the "simple" products as [s7,s6,s5,s4,s0] + + mul s0, a0, b0 + mul s4, a1, b1 + mul s5, a2, b2 + mul s6, a3, b3 + + umulh s7, a0, b0 + adds s4, s4, s7 + umulh s7, a1, b1 + adcs s5, s5, s7 + umulh s7, a2, b2 + adcs s6, s6, s7 + umulh s7, a3, b3 + adc s7, s7, xzr + +// Multiply by B + 1 to get [s7;s6;s5;s4;s1;s0] + + adds s1, s4, s0 + adcs s4, s5, s4 + adcs s5, s6, s5 + adcs s6, s7, s6 + adc s7, xzr, s7 + +// Multiply by B^2 + 1 to get [s7;s6;s5;s4;s3;s2;s1;s0] + + adds s2, s4, s0 + adcs s3, s5, s1 + adcs s4, s6, s4 + adcs s5, s7, s5 + adcs s6, xzr, s6 + adc s7, xzr, s7 + +// Optionally add the existing z contents + +.rep \afl + ldp l, h, [z,#32] + adds s0, s0, l + adcs s1, s1, h + ldp l, h, [z,#48] + adcs s2, s2, l + adcs s3, s3, h + adcs s4, s4, xzr + adcs s5, s5, xzr + adcs s6, s6, xzr + adc s7, s7, xzr +.endr + +// Now add in all the "complicated" terms. + + muldiffnadd s6,s5, c,h,l,t, a2,a3, b3,b2 + adc s7, s7, c + + muldiffnadd s2,s1, c,h,l,t, a0,a1, b1,b0 + adcs s3, s3, c + adcs s4, s4, c + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s5,s4, c,h,l,t, a1,a3, b3,b1 + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s3,s2, c,h,l,t, a0,a2, b2,b0 + adcs s4, s4, c + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s4,s3, c,h,l,t, a0,a3, b3,b0 + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + muldiffnadd s4,s3, c,h,l,t, a1,a2, b2,b1 + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c +.endm + +// --------------------------------------------------------------------------- +// The main code +// --------------------------------------------------------------------------- + +S2N_BN_SYMBOL(bignum_mul_8_16_base): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + +// Multiply the low halves and then the high halves using ADK 4x4->8. 
+// For the second one add the top of the low part (Q1) already into +// the bottom of the high part (Q2) so that is already dealt with. +// +// Write back the first one but defer the second till a bit later while +// we get on with the absolute difference computations + + ldp a0, a1, [x] + ldp b0, b1, [y] + ldp a2, a3, [x, #16] + ldp b2, b3, [y, #16] + + mul4 0 + + ldp a0, a1, [x, #32] + stp s0, s1, [z] + ldp b0, b1, [y, #32] + stp s2, s3, [z, #16] + ldp a2, a3, [x, #48] + stp s4, s5, [z, #32] + ldp b2, b3, [y, #48] + stp s6, s7, [z, #48] + + mul4 1 + +// Compute t,[a3,a2,a1,a0] = x_hi - x_lo +// and s,[b3,b2,b1,b0] = y_lo - y_hi +// sign-magnitude differences, and scatter in belated high writeback + + ldp l, h, [x] + subs a0, a0, l + sbcs a1, a1, h + ldp l, h, [x, #16] + sbcs a2, a2, l + sbcs a3, a3, h + csetm t, cc + + stp s0, s1, [z, #64] + + ldp l, h, [y] + subs b0, l, b0 + sbcs b1, h, b1 + ldp l, h, [y, #16] + sbcs b2, l, b2 + sbcs b3, h, b3 + csetm s, cc + + stp s2, s3, [z, #80] + + eor a0, a0, t + subs a0, a0, t + eor a1, a1, t + sbcs a1, a1, t + eor a2, a2, t + sbcs a2, a2, t + eor a3, a3, t + sbc a3, a3, t + + stp s4, s5, [z, #96] + + eor b0, b0, s + subs b0, b0, s + eor b1, b1, s + sbcs b1, b1, s + eor b2, b2, s + sbcs b2, b2, s + eor b3, b3, s + sbc b3, b3, s + + stp s6, s7, [z, #112] + +// Save the correct sign for the sub-product + + eor s, s, t + +// Now yet another 4x4->8 ADK core, but not writing back, keeping s0..s7 + + mul4 0 + +// Now accumulate the positive mid-terms as [u7,u6,u5,u4,u3.u2,u1,u0] + + ldp u0, u1, [z] + ldp u4, u5, [z,#64] + adds u0, u0, u4 + adcs u1, u1, u5 + ldp u2, u3, [z,#16] + ldp u6, u7, [z,#80] + adcs u2, u2, u6 + adcs u3, u3, u7 + ldp u8, u9, [z,#96] + adcs u4, u4, u8 + adcs u5, u5, u9 + ldp u10, u11, [z,#112] + adcs u6, u6, u10 + adcs u7, u7, u11 + +// Stop the carry here so we can reintroduce it, taking into account the +// effective addition of s from sign-extension below. Note that we get +// a duplicated word c+carry beyond the first one, so this upper part is +// of the form [d,d,d,t]. + + adcs t, s, xzr + adc d, s, xzr + +// Add in the sign-adjusted complex term + + adds xzr, s, #1 + eor s0, s0, s + adcs u0, s0, u0 + eor s1, s1, s + adcs u1, s1, u1 + eor s2, s2, s + adcs u2, s2, u2 + eor s3, s3, s + adcs u3, s3, u3 + eor s4, s4, s + adcs u4, s4, u4 + eor s5, s5, s + adcs u5, s5, u5 + eor s6, s6, s + adcs u6, s6, u6 + eor s7, s7, s + adcs u7, s7, u7 + +// From this point on replace the sign with the suspended carry indication + + adcs u8, u8, t + adcs u9, u9, d + adcs u10, u10, d + adc u11, u11, d + +// Store it back + + stp u0, u1, [z,#32] + stp u2, u3, [z,#48] + stp u4, u5, [z,#64] + stp u6, u7, [z,#80] + stp u8, u9, [z,#96] + stp u10, u11, [z,#112] + +// Restore regs and return + + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_sqr_8_16_base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_sqr_8_16_base.S new file mode 100644 index 00000000000..3cd6ecf68bc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/fastmul/unopt/bignum_sqr_8_16_base.S @@ -0,0 +1,356 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16_base (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_base) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro computing [c,b,a] := [b,a] + (x - y) * (w - z), adding with carry +// to the [b,a] components but leaving CF aligned with the c term, which is +// a sign bitmask for (x - y) * (w - z). Continued add-with-carry operations +// with [c,...,c] will continue the carry chain correctly starting from +// the c position if desired to add to a longer term of the form [...,b,a]. +// +// c,h,l,t should all be different and t,h should not overlap w,z. +// --------------------------------------------------------------------------- + +.macro muldiffnadd b,a, c,h,l,t, x,y, w,z + subs \t, \x, \y + cneg \t, \t, cc + csetm \c, cc + subs \h, \w, \z + cneg \h, \h, cc + mul \l, \t, \h + umulh \h, \t, \h + cinv \c, \c, cc + adds xzr, \c, #1 + eor \l, \l, \c + adcs \a, \a, \l + eor \h, \h, \c + adcs \b, \b, \h +.endm + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 +#define b0 x6 +#define b1 x7 +#define b2 x8 +#define b3 x9 + +#define s0 x10 +#define s1 x11 +#define s2 x12 +#define s3 x13 +#define s4 x14 +#define s5 x15 +#define s6 x16 +#define s7 x17 + +#define c x19 +#define h x20 +#define l x21 +#define t x22 + +// --------------------------------------------------------------------------- +// Core 4x4->8 ADK multiplication macro +// Does [s7,s6,s5,s4,s3,s2,s1,s0] = [a3,a2,a1,a0] * [b3,b2,b1,b0] +// --------------------------------------------------------------------------- + +.macro mul4 + +// First accumulate all the "simple" products as [s7,s6,s5,s4,s0] + + mul s0, a0, b0 + mul s4, a1, b1 + mul s5, a2, b2 + mul s6, a3, b3 + + umulh s7, a0, b0 + adds s4, s4, s7 + umulh s7, a1, b1 + adcs s5, s5, s7 + umulh s7, a2, b2 + adcs s6, s6, s7 + umulh s7, a3, b3 + adc s7, s7, xzr + +// Multiply by B + 1 to get [s7;s6;s5;s4;s1;s0] + + adds s1, s4, s0 + adcs s4, s5, s4 + adcs s5, s6, s5 + adcs s6, s7, s6 + adc s7, xzr, s7 + +// Multiply by B^2 + 1 to get [s7;s6;s5;s4;s3;s2;s1;s0] + + adds s2, s4, s0 + adcs s3, s5, s1 + adcs s4, s6, s4 + adcs s5, s7, s5 + adcs s6, xzr, s6 + adc s7, xzr, s7 + +// Now add in all the "complicated" terms. 
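Reviewer note: the mul4 macro needs only the four diagonal products a_i*b_i up front (spread across positions by the B+1 and B^2+1 passes); the six "complicated" terms it then adds are signed difference products. That works because a_i*b_j + a_j*b_i = a_i*b_i + a_j*b_j + (a_i - a_j)*(b_j - b_i), which can be sanity-checked with a tiny C harness (illustrative only):

```c
#include <assert.h>
#include <stdint.h>

typedef unsigned __int128 u128;

// Check a_i*b_j + a_j*b_i == a_i*b_i + a_j*b_j + (a_i - a_j)*(b_j - b_i).
// The comparison is modulo 2^128 (unsigned wraparound), which suffices
// because the identity holds exactly over the integers.
static void check_adk_identity(uint64_t ai, uint64_t aj,
                               uint64_t bi, uint64_t bj) {
  u128 lhs = (u128)ai * bj + (u128)aj * bi;
  u128 rhs = (u128)ai * bi + (u128)aj * bj
           + ((u128)ai - aj) * ((u128)bj - bi);  // differences wrap like two's complement
  assert(lhs == rhs);
}
```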
+ + muldiffnadd s6,s5, c,h,l,t, a2,a3, b3,b2 + adc s7, s7, c + + muldiffnadd s2,s1, c,h,l,t, a0,a1, b1,b0 + adcs s3, s3, c + adcs s4, s4, c + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s5,s4, c,h,l,t, a1,a3, b3,b1 + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s3,s2, c,h,l,t, a0,a2, b2,b0 + adcs s4, s4, c + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + + muldiffnadd s4,s3, c,h,l,t, a0,a3, b3,b0 + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c + muldiffnadd s4,s3, c,h,l,t, a1,a2, b2,b1 + adcs s5, s5, c + adcs s6, s6, c + adc s7, s7, c +.endm + +// --------------------------------------------------------------------------- +// The main code +// --------------------------------------------------------------------------- + +S2N_BN_SYMBOL(bignum_sqr_8_16_base): + +// Save registers + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + +// Load all the inputs first + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + ldp b0, b1, [x, #32] + ldp b2, b3, [x, #48] + +// Square the lower half with a near-clone of bignum_sqr_4_8 + + mul x17, x2, x4 + mul x14, x3, x5 + umulh x20, x2, x4 + subs x21, x2, x3 + cneg x21, x21, cc + csetm x11, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x13, x21, x12 + umulh x12, x21, x12 + cinv x11, x11, cc + eor x13, x13, x11 + eor x12, x12, x11 + adds x19, x17, x20 + adc x20, x20, xzr + umulh x21, x3, x5 + adds x19, x19, x14 + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 + adcs x20, x20, x12 + adc x21, x21, x11 + adds x17, x17, x17 + adcs x19, x19, x19 + adcs x20, x20, x20 + adcs x21, x21, x21 + adc x10, xzr, xzr + mul x12, x2, x2 + mul x13, x3, x3 + mul x15, x2, x3 + umulh x11, x2, x2 + umulh x14, x3, x3 + umulh x16, x2, x3 + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [z] + adds x17, x17, x13 + adcs x19, x19, x14 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x10, x10, xzr + stp x17, x19, [z, #16] + mul x12, x4, x4 + mul x13, x5, x5 + mul x15, x4, x5 + umulh x11, x4, x4 + umulh x14, x5, x5 + umulh x16, x4, x5 + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x12, x12, x20 + adcs x11, x11, x21 + stp x12, x11, [z, #32] + adcs x13, x13, x10 + adc x14, x14, xzr + stp x13, x14, [z, #48] + +// Square the upper half with a slight variant of the previous block + + mul x17, x6, x8 + mul x14, x7, x9 + umulh x20, x6, x8 + subs x21, x6, x7 + cneg x21, x21, cc + csetm x11, cc + subs x12, x9, x8 + cneg x12, x12, cc + mul x13, x21, x12 + umulh x12, x21, x12 + cinv x11, x11, cc + eor x13, x13, x11 + eor x12, x12, x11 + adds x19, x17, x20 + adc x20, x20, xzr + umulh x21, x7, x9 + adds x19, x19, x14 + adcs x20, x20, x21 + adc x21, x21, xzr + adds x20, x20, x14 + adc x21, x21, xzr + cmn x11, #0x1 + adcs x19, x19, x13 + adcs x20, x20, x12 + adc x21, x21, x11 + adds x17, x17, x17 + adcs x19, x19, x19 + adcs x20, x20, x20 + adcs x21, x21, x21 + adc x10, xzr, xzr + mul x12, x6, x6 + mul x13, x7, x7 + mul x15, x6, x7 + umulh x11, x6, x6 + umulh x14, x7, x7 + umulh x16, x6, x7 + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + stp x12, x11, [z, #64] + adds x17, x17, x13 + adcs x19, x19, x14 + adcs x20, x20, xzr + adcs x21, x21, xzr + adc x10, x10, xzr + stp x17, x19, [z, #80] + mul x12, x8, x8 + mul x13, x9, x9 + mul x15, x8, x9 + umulh x11, x8, x8 
+ umulh x14, x9, x9 + umulh x16, x8, x9 + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x11, x11, x15 + adcs x13, x13, x16 + adc x14, x14, xzr + adds x12, x12, x20 + adcs x11, x11, x21 + stp x12, x11, [z, #96] + adcs x13, x13, x10 + adc x14, x14, xzr + stp x13, x14, [z, #112] + +// Now get the cross-product in [s7,...,s0] and double it as [c,s7,...,s0] + + mul4 + + adds s0, s0, s0 + adcs s1, s1, s1 + adcs s2, s2, s2 + adcs s3, s3, s3 + adcs s4, s4, s4 + adcs s5, s5, s5 + adcs s6, s6, s6 + adcs s7, s7, s7 + adc c, xzr, xzr + +// Add it back to the buffer + + ldp a0, a1, [z, #32] + adds s0, s0, a0 + adcs s1, s1, a1 + stp s0, s1, [z, #32] + + ldp a0, a1, [z, #48] + adcs s2, s2, a0 + adcs s3, s3, a1 + stp s2, s3, [z, #48] + + ldp a0, a1, [z, #64] + adcs s4, s4, a0 + adcs s5, s5, a1 + stp s4, s5, [z, #64] + + ldp a0, a1, [z, #80] + adcs s6, s6, a0 + adcs s7, s7, a1 + stp s6, s7, [z, #80] + + ldp a0, a1, [z, #96] + adcs a0, a0, c + adcs a1, a1, xzr + stp a0, a1, [z, #96] + + ldp a0, a1, [z, #112] + adcs a0, a0, xzr + adc a1, a1, xzr + stp a0, a1, [z, #112] + +// Restore regs and return + + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/Makefile new file mode 100644 index 00000000000..c81b9239471 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/Makefile @@ -0,0 +1,103 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). 
The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_add.o \ + bignum_amontifier.o \ + bignum_amontmul.o \ + bignum_amontredc.o \ + bignum_amontsqr.o \ + bignum_bitfield.o \ + bignum_bitsize.o \ + bignum_cdiv.o \ + bignum_cdiv_exact.o \ + bignum_cld.o \ + bignum_clz.o \ + bignum_cmadd.o \ + bignum_cmnegadd.o \ + bignum_cmod.o \ + bignum_cmul.o \ + bignum_coprime.o \ + bignum_copy.o \ + bignum_copy_row_from_table.o \ + bignum_copy_row_from_table_8n.o \ + bignum_copy_row_from_table_16.o \ + bignum_copy_row_from_table_32.o \ + bignum_ctd.o \ + bignum_ctz.o \ + bignum_demont.o \ + bignum_digit.o \ + bignum_digitsize.o \ + bignum_divmod10.o \ + bignum_emontredc.o \ + bignum_eq.o \ + bignum_even.o \ + bignum_ge.o \ + bignum_gt.o \ + bignum_iszero.o \ + bignum_le.o \ + bignum_lt.o \ + bignum_madd.o \ + bignum_modadd.o \ + bignum_moddouble.o \ + bignum_modexp.o \ + bignum_modifier.o \ + bignum_modinv.o \ + bignum_modoptneg.o \ + bignum_modsub.o \ + bignum_montifier.o \ + bignum_montmul.o \ + bignum_montredc.o \ + bignum_montsqr.o \ + bignum_mul.o \ + bignum_muladd10.o \ + bignum_mux.o \ + bignum_mux16.o \ + bignum_negmodinv.o \ + bignum_nonzero.o \ + bignum_normalize.o \ + bignum_odd.o \ + bignum_of_word.o \ + bignum_optadd.o \ + bignum_optneg.o \ + bignum_optsub.o \ + bignum_optsubadd.o \ + bignum_pow2.o \ + bignum_shl_small.o \ + bignum_shr_small.o \ + bignum_sqr.o \ + bignum_sub.o \ + word_bytereverse.o \ + word_clz.o \ + word_ctz.o \ + word_divstep59.o \ + word_max.o \ + word_min.o \ + word_negmodinv.o \ + word_popcount.o \ + word_recip.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_add.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_add.S new file mode 100644 index 00000000000..05d3487ddee --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_add.S @@ -0,0 +1,121 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add, z := x + y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +// +// extern uint64_t bignum_add +// (uint64_t p, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the z := x + y operation, truncating modulo p words in general and +// returning a top carry (0 or 1) in the p'th place, only adding the input +// words below p (as well as m and n respectively) to get the sum and carry. +// +// Standard ARM ABI: X0 = p, X1 = z, X2 = m, X3 = x, X4 = n, X5 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add) + .text + .balign 4 + +#define p x0 +#define z x1 +#define m x2 +#define x x3 +#define n x4 +#define y x5 +#define i x6 +#define a x7 +#define d x8 + + +S2N_BN_SYMBOL(bignum_add): + +// First clamp the two input sizes m := min(p,m) and n := min(p,n) since +// we'll never need words past the p'th. Can now assume m <= p and n <= p. 
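Reviewer note: the behaviour specified for bignum_add above fits in a few lines of C — clamp both input lengths to p, add with carry across the p-word destination, let any leftover carry settle in the first word past the shorter inputs when p is larger, and return the top carry only when the result is truncated at the p'th word. A behavioural model (not the constant-control-flow assembly; the name is invented):

```c
#include <stdint.h>

// Behavioural model of the z := x + y operation truncated to p words.
static uint64_t bignum_add_model(uint64_t p, uint64_t *z,
                                 uint64_t m, const uint64_t *x,
                                 uint64_t n, const uint64_t *y) {
  if (m > p) m = p;                    // words at index >= p are never read
  if (n > p) n = p;
  unsigned __int128 c = 0;
  for (uint64_t i = 0; i < p; i++) {
    c += (i < m) ? x[i] : 0;
    c += (i < n) ? y[i] : 0;
    z[i] = (uint64_t)c;                // a leftover carry lands here when p is larger
    c >>= 64;
  }
  return (uint64_t)c;                  // 0 or 1
}
```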
+// Then compare the modified m and n and branch accordingly + + cmp m, p + csel m, p, m, cs + cmp n, p + csel n, p, n, cs + cmp m, n + bcc bignum_add_ylonger + +// The case where x is longer or of the same size (p >= m >= n) + + sub p, p, m + sub m, m, n + ands i, xzr, xzr + cbz n, bignum_add_xmainskip +bignum_add_xmainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + adcs a, a, d + str a, [z, i, lsl #3] + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_add_xmainloop +bignum_add_xmainskip: + cbz m, bignum_add_xtopskip +bignum_add_xtoploop: + ldr a, [x, i, lsl #3] + adcs a, a, xzr + str a, [z, i, lsl #3] + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_add_xtoploop +bignum_add_xtopskip: + cbnz p, bignum_add_tails + cset x0, cs + ret + +// The case where y is longer (p >= n > m) + +bignum_add_ylonger: + sub p, p, n + sub n, n, m + ands i, xzr, xzr + cbz m, bignum_add_ytoploop +bignum_add_ymainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + adcs a, a, d + str a, [z, i, lsl #3] + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_add_ymainloop +bignum_add_ytoploop: + ldr a, [y, i, lsl #3] + adcs a, xzr, a + str a, [z, i, lsl #3] + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_add_ytoploop +bignum_add_ytopskip: + cbnz p, bignum_add_tails + cset x0, cs + ret + +// Adding a non-trivial tail, when p > max(m,n) + +bignum_add_tails: + cset a, cs + str a, [z, i, lsl #3] + b bignum_add_tail +bignum_add_tailloop: + str xzr, [z, i, lsl #3] +bignum_add_tail: + add i, i, #1 + sub p, p, #1 + cbnz p, bignum_add_tailloop + mov x0, xzr + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontifier.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontifier.S new file mode 100644 index 00000000000..c5b1a9d9d52 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontifier.S @@ -0,0 +1,386 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "amontification" constant z :== 2^{128k} (congruent mod m) +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_amontifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// This is called "amontifier" because any other value x can now be mapped into +// the almost-Montgomery domain with an almost-Montgomery multiplication by z. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontifier) + .text + .balign 4 + +#define k x0 +#define z x1 +#define m x2 +#define t x3 + +// Some variables + +#define i x4 +#define j x5 +#define h x6 +#define a x7 +#define l x8 +#define c x9 +#define b x10 +#define d x11 + +// Some aliases for the values b and d + +#define r x10 +#define q x11 + + +S2N_BN_SYMBOL(bignum_amontifier): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_amontifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. 
+ + mov i, xzr +bignum_amontifier_copyinloop: + ldr c, [m, i, lsl #3] + str c, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_amontifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "cmp c, xzr" sets the zeroness predicate (ZF) for the entire inner loop + + subs i, k, #1 + beq bignum_amontifier_normalized +bignum_amontifier_normloop: + mov j, xzr + cmp c, xzr + mov a, xzr +bignum_amontifier_shufloop: + mov c, a + ldr a, [t, j, lsl #3] + csel c, c, a, eq + str c, [t, j, lsl #3] + add j, j, #1 + sub d, j, k + cbnz d, bignum_amontifier_shufloop + subs i, i, #1 + bne bignum_amontifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. + +bignum_amontifier_normalized: + clz c, c + + mov b, xzr + mov i, xzr + ands xzr, c, #63 + csetm l, ne + neg d, c +bignum_amontifier_bitloop: + ldr j, [t, i, lsl #3] + lsl a, j, c + orr a, a, b + lsr b, j, d + and b, b, l + str a, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_amontifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. + + sub h, k, #1 + ldr h, [t, h, lsl #3] + mov q, #1 + neg r, h + mov i, #62 +bignum_amontifier_estloop: + add q, q, q + mov a, h + sub a, a, r + cmp r, a // CF <=> r >= h - r <=> 2 * r >= h + csetm a, cs + sub q, q, a + add r, r, r + and a, a, h + sub r, r, a + subs i, i, #1 + bne bignum_amontifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + cmp r, h + csinc q, q, q, ne + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) 
make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + mov c, xzr + adds i, xzr, xzr +bignum_amontifier_mulloop: + ldr a, [t, i, lsl #3] + mul l, q, a + adcs l, l, c + umulh c, q, a + str l, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_amontifier_mulloop + + adc c, c, xzr + mov a, #0x4000000000000000 + subs c, c, a + csetm q, cs + +// Now do [c] * n - d for our final answer + + subs i, xzr, xzr +bignum_amontifier_remloop: + ldr a, [t, i, lsl #3] + ldr b, [z, i, lsl #3] + and a, a, q + sbcs a, a, b + str a, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_amontifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from the initial 2^{p+62} == r (mod n). + + mov c, xzr + subs j, xzr, xzr +bignum_amontifier_dubloop1: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_dubloop1 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_amontifier_corrloop1: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + mov c, xzr + subs j, xzr, xzr +bignum_amontifier_dubloop2: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_dubloop2 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_amontifier_corrloop2: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + mov h, xzr + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this k more times so we end up == 2^{128*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. 
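Reviewer note on the quotient estimate used earlier in this routine: starting from 2^64 = 1*h + (2^64 - h) for a normalized h >= 2^63, the 62-step loop doubles q and r and subtracts h from r whenever 2r >= h, ending with q = floor(2^126 / h) and r = 2^126 mod h (up to the r = h = 2^63 corner case handled by the csinc fix-up after the loop). A direct C model of that loop (illustrative only):

```c
#include <stdint.h>

// Shift-and-subtract estimation of q = floor(2^126 / h), r = 2^126 mod h,
// for a normalized word h >= 2^63.
static void estimate_quotient(uint64_t h, uint64_t *q_out, uint64_t *r_out) {
  uint64_t q = 1;
  uint64_t r = (uint64_t)0 - h;        // 2^64 - h, i.e. 2^64 mod h for h > 2^63
  for (int i = 0; i < 62; i++) {
    int ge = r >= h - r;               // does 2*r reach h? (compare before doubling)
    q = 2 * q + (uint64_t)ge;
    r = 2 * r;                         // may wrap; the subtraction below fixes it up
    if (ge) r -= h;
  }
  *q_out = q;
  *r_out = r;
}
```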
+ + mov i, k +bignum_amontifier_modloop: + mov j, xzr + mov b, xzr + adds c, xzr, xzr +bignum_amontifier_cmaloop: + ldr a, [z, j, lsl #3] + mul l, h, a + adcs b, b, c + umulh c, h, a + adc c, c, xzr + adds l, b, l + ldr b, [t, j, lsl #3] + str l, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_cmaloop + + adcs h, b, c + + csetm l, cs + + adds j, xzr, xzr +bignum_amontifier_oaloop: + ldr a, [t, j, lsl #3] + ldr b, [z, j, lsl #3] + and b, b, l + adcs a, a, b + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_oaloop + adc h, h, xzr + + subs i, i, #1 + bne bignum_amontifier_modloop + +// Now do one almost-Montgomery reduction w.r.t. the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits for +// the end result. + + ldr a, [m] + lsl d, a, #2 + sub d, a, d + eor d, d, #2 + mov l, #1 + madd c, a, d, l + mul b, c, c + madd d, c, d, d + mul c, b, b + madd d, b, d, d + mul b, c, c + madd d, c, d, d + madd d, b, d, d + + ldr b, [t] + mul d, b, d + + mul l, d, a + umulh c, d, a + mov j, #1 + sub a, k, #1 + adds xzr, b, l + cbz a, bignum_amontifier_montend + +bignum_amontifier_montloop: + ldr a, [m, j, lsl #3] + ldr b, [t, j, lsl #3] + mul l, d, a + adcs b, b, c + umulh c, d, a + adc c, c, xzr + adds b, b, l + sub a, j, #1 + str b, [t, a, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_montloop +bignum_amontifier_montend: + adcs h, h, c + csetm l, cs + sub a, k, #1 + str h, [t, a, lsl #3] + + subs j, xzr, xzr +bignum_amontifier_osloop: + ldr a, [t, j, lsl #3] + ldr b, [m, j, lsl #3] + and b, b, l + sbcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontifier_osloop + +bignum_amontifier_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontmul.S new file mode 100644 index 00000000000..79e4a5f03ef --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontmul.S @@ -0,0 +1,180 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m) +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_amontmul +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Does z :== (x * y / 2^{64k}) mod m, meaning that the result, in the native +// size k, is congruent modulo m, but might not be fully reduced mod m. This +// is why it is called *almost* Montgomery multiplication. 
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = y, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontmul) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define y x3 +#define m x4 + +// Negated modular inverse +#define w x5 +// Top carry for k'th position +#define c0 x6 +// Additional top carry for (k+1)'th position +#define c1 x7 +// Outer loop counter +#define i x8 +// Home for i'th digit or Montgomery multiplier +#define d x9 +// Inner loop counter +#define j x10 +#define h x11 +#define e x12 +#define l x13 +#define a x14 + +// This is just a short-term temporary used in zero-test subtraction. +// It's aliased to the same register as "a" which is always safe here. + +#define t x14 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use c0 and c1 again, which aren't initialized early on. + +#define one x6 +#define e1 x6 +#define e2 x7 +#define e4 x6 +#define e8 x7 + + +S2N_BN_SYMBOL(bignum_amontmul): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_amontmul_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + mov i, xzr +bignum_amontmul_zoop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_amontmul_zoop + mov c0, xzr + +// Outer loop pulling down digits d=x[i], multiplying by y and reducing + + mov i, xzr +bignum_amontmul_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. + + ldr d, [x, i, lsl #3] + mov j, xzr + adds h, xzr, xzr +bignum_amontmul_maddloop: + ldr a, [y, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + str e, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontmul_maddloop + adcs c0, c0, h + adc c1, xzr, xzr + +// Montgomery reduction loop, similar but offsetting writebacks + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub t, k, #1 + cbz t, bignum_amontmul_montend +bignum_amontmul_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontmul_montloop +bignum_amontmul_montend: + adcs h, c0, h + adc c0, c1, xzr + sub l, j, #1 + str h, [z, l, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, k + bcc bignum_amontmul_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask +// and do a masked subtraction of m for the final almost-Montgomery result. 
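+
+// For reference (illustrative only, not part of the upstream source), the
+// word_negmodinv-style sequence near the top of this function corresponds to
+// the C sketch below (hypothetical helper name). Each step squares the error
+// e = m*w + 1 modulo 2^64, and the initial value is already a negated inverse
+// modulo 2^4, so four steps give w * m + 1 == 0 (mod 2^64) for any odd m:
+//
+//   static inline uint64_t word_negmodinv_sketch(uint64_t m) {
+//     uint64_t w = (m - (m << 2)) ^ 2;  // negated inverse mod 2^4 (at least)
+//     uint64_t e = m * w + 1;           // error term
+//     w += e * w; e *= e;               // error is now e^2
+//     w += e * w; e *= e;               // error is now e^4
+//     w += e * w; e *= e;               // error is now e^8
+//     w += e * w;                       // error is now e^16 == 0 (mod 2^64)
+//     return w;
+//   }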
+ + neg c0, c0 + subs j, xzr, xzr +bignum_amontmul_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c0 + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontmul_corrloop + +bignum_amontmul_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontredc.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontredc.S new file mode 100644 index 00000000000..79fef8e5be5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontredc.S @@ -0,0 +1,176 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m) +// Inputs x[n], m[k], p; output z[k] +// +// extern void bignum_amontredc +// (uint64_t k, uint64_t *z, +// uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); +// +// Does a :== (x' / 2^{64p}) mod m where x' = x if n <= p + k and in general +// is the lowest (p+k) digits of x. That is, p-fold almost-Montgomery reduction +// w.r.t. a k-digit modulus m giving a k-digit answer. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = m, X5 = p +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontredc) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 +#define x x3 +#define m x4 +#define p x5 + +// Negated modular inverse +#define w x6 +// Outer loop counter +#define i x7 +// Inner loop counter +#define j x8 +// Home for Montgomery multiplier +#define d x9 +// Top carry for current window +#define c x14 + +#define h x10 +#define e x11 +#define l x12 +#define a x13 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use i and j again, which aren't used early on. + +#define one x7 +#define e1 x7 +#define e2 x8 +#define e4 x7 +#define e8 x8 + + +S2N_BN_SYMBOL(bignum_amontredc): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_amontredc_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize z to the lowest k digits of the input, zero-padding if n < k. 
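+
+// A minimal C sketch of this initialization (illustrative only, not part of
+// the upstream source): copy min(n,k) digits of x, then zero-pad up to k:
+//
+//   uint64_t j = (n < k) ? n : k, i;
+//   for (i = 0; i < j; i++) z[i] = x[i];
+//   for (; i < k; i++) z[i] = 0;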
+ + cmp n, k + csel j, k, n, cs + mov i, xzr + cbz j, bignum_amontredc_padloop +bignum_amontredc_copyloop: + ldr a, [x, i, lsl #3] + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, j + bcc bignum_amontredc_copyloop + + cmp i, k + bcs bignum_amontredc_initialized + +bignum_amontredc_padloop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_amontredc_padloop + +bignum_amontredc_initialized: + mov c, xzr + +// Now if p = 0 that's the end of the operation + + cbz p, bignum_amontredc_end + +// Outer loop, just doing a standard Montgomery reduction on z + + mov i, xzr +bignum_amontredc_outerloop: + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub a, k, #1 + cbz a, bignum_amontredc_montend +bignum_amontredc_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontredc_montloop +bignum_amontredc_montend: + adcs h, h, c + adc c, xzr, xzr + add j, j, i + cmp j, n + bcs bignum_amontredc_offtheend + ldr a, [x, j, lsl #3] + adds h, h, a + adc c, c, xzr +bignum_amontredc_offtheend: + sub j, k, #1 + str h, [z, j, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, p + bcc bignum_amontredc_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask +// and do a masked subtraction of m for the final almost-Montgomery result. + + neg c, c + subs j, xzr, xzr +bignum_amontredc_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_amontredc_corrloop + +bignum_amontredc_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontsqr.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontsqr.S new file mode 100644 index 00000000000..d927e137d84 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_amontsqr.S @@ -0,0 +1,180 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m) +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_amontsqr +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z :== (x^2 / 2^{64k}) mod m, meaning that the result, in the native +// size k, is congruent modulo m, but might not be fully reduced mod m. This +// is why it is called *almost* Montgomery squaring. 
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontsqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontsqr) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define m x3 + +// Negated modular inverse +#define w x4 +// Top carry for k'th position +#define c0 x5 +// Additional top carry for (k+1)'th position +#define c1 x6 +// Outer loop counter +#define i x7 +// Home for i'th digit or Montgomery multiplier +#define d x8 +// Inner loop counter +#define j x9 + +#define h x10 +#define e x11 +#define l x12 +#define a x13 + +// This is just a short-term temporary used in zero-test subtraction. +// It's aliased to the same register as "a" which is always safe here. + +#define t x13 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use c0 and c1 again, which aren't initialized early on. + +#define one x5 +#define e1 x5 +#define e2 x6 +#define e4 x5 +#define e8 x6 + + +S2N_BN_SYMBOL(bignum_amontsqr): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_amontsqr_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + mov i, xzr +bignum_amontsqr_zoop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_amontsqr_zoop + mov c0, xzr + +// Outer loop pulling down digits d=x[i], multiplying by x and reducing + + mov i, xzr +bignum_amontsqr_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. + + ldr d, [x, i, lsl #3] + mov j, xzr + adds h, xzr, xzr +bignum_amontsqr_maddloop: + ldr a, [x, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + str e, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontsqr_maddloop + adcs c0, c0, h + adc c1, xzr, xzr + +// Montgomery reduction loop, similar but offsetting writebacks + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub t, k, #1 + cbz t, bignum_amontsqr_montend +bignum_amontsqr_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontsqr_montloop +bignum_amontsqr_montend: + adcs h, c0, h + adc c0, c1, xzr + sub l, j, #1 + str h, [z, l, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, k + bcc bignum_amontsqr_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask +// and do a masked subtraction of m for the final almost-Montgomery result. 
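+
+// A minimal C sketch of this masked subtraction (illustrative only, not part
+// of the upstream source): with c0 in {0,1}, the mask 0 - c0 selects either
+// m or 0, so the subtraction happens exactly when the top carry was set,
+// with no data-dependent branch:
+//
+//   uint64_t mask = 0 - c0, borrow = 0;
+//   for (uint64_t j = 0; j < k; j++) {
+//     uint64_t mj = m[j] & mask;
+//     uint64_t zj = z[j];
+//     uint64_t d = zj - borrow;
+//     uint64_t b = (zj < borrow);
+//     z[j] = d - mj;
+//     borrow = b | (d < mj);
+//   }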
+ + neg c0, c0 + subs j, xzr, xzr +bignum_amontsqr_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c0 + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_amontsqr_corrloop + +bignum_amontsqr_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitfield.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitfield.S new file mode 100644 index 00000000000..1630f0b0f62 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitfield.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select bitfield starting at bit n with length l <= 64 +// Inputs x[k], n, l; output function return +// +// extern uint64_t bignum_bitfield +// (uint64_t k, uint64_t *x, uint64_t n, uint64_t l); +// +// One-word bitfield from a k-digit (digit=64 bits) bignum, in constant-time +// style. Bitfield starts at bit n and has length l, indexing from 0 (=LSB). +// Digits above the top are treated uniformly as zero, as usual. Since the +// result is returned in a single word, effectively we use l' = min(64,l) +// for the length. +// +// Standard ARM ABI: X0 = k, X1 = x, X2 = n, X3 = l, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bitfield) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bitfield) + .text + .balign 4 + +#define k x0 +#define x x1 +#define n x2 +#define l x3 + +#define d x4 +#define e x5 +#define i x6 +#define a x7 +#define m x8 + + +S2N_BN_SYMBOL(bignum_bitfield): + +// For length zero finish immediately (the return value in x0 is 0) + + cbz k, bignum_bitfield_end + +// Decompose the index into n = 64 * n + m, then increment n for next part + + and m, n, #63 + lsr n, n, #6 + add n, n, #1 + +// Run over the digits setting d = n'th and e = (n+1)'th + + mov i, xzr + mov e, xzr +bignum_bitfield_loop: + ldr a, [x, i, lsl #3] + cmp i, n + csel d, a, d, cc + csel e, a, e, eq + add i, i, #1 + cmp i, k + bcc bignum_bitfield_loop + +// Override d with 0 if we ran off the end (e will retain original 0). + + cmp i, n + csel d, xzr, d, cc + +// Override e if we have m = 0 (i.e. original n was divisible by 64) +// This is because then we want to shift it right by 64 below. + + cmp m, xzr + csel e, xzr, e, eq + +// Combine shifted digits to get the bitfield(n,64) + + lsr d, d, m + neg m, m + lsl e, e, m + orr a, d, e + +// Now mask it down to get bitfield (n,l) + + cmp l, #64 + cset m, cc + lsl m, m, l + sub m, m, #1 + and x0, a, m +bignum_bitfield_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitsize.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitsize.S new file mode 100644 index 00000000000..e1a4a6dafde --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_bitsize.S @@ -0,0 +1,67 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return size of bignum in bits +// Input x[k]; output function return +// +// extern uint64_t bignum_bitsize (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 0 +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations. +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bitsize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bitsize) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define w x3 +#define a x4 +#define j x5 + + +S2N_BN_SYMBOL(bignum_bitsize): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_bitsize_end + +// Use w = a[i-1] to store nonzero words in a bottom-up sweep +// Set the initial default to be as if we had a 11...11 word directly below + + mov i, xzr + mov w, #-1 + mov j, xzr +bignum_bitsize_loop: + ldr a, [x, j, lsl #3] + add j, j, #1 + cmp a, #0 + csel i, j, i, ne + csel w, a, w, ne + cmp j, k + bne bignum_bitsize_loop + +// Now w = a[i-1] is the highest nonzero word, or in the zero case the +// default of the "extra" 11...11 = a[0-1]. We now want 64* i - clz(w). +// Note that this code does not rely on the behavior of the clz instruction +// for zero inputs, though the ARM manual does in fact guarantee clz(0) = 64. + + lsl i, i, #6 + clz a, w + sub x0, i, a + +bignum_bitsize_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv.S new file mode 100644 index 00000000000..7e6480c2a03 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv.S @@ -0,0 +1,278 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide by a single (nonzero) word, z := x / m and return x mod m +// Inputs x[n], m; outputs function return (remainder) and z[k] +// +// extern uint64_t bignum_cdiv +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); +// +// Does the "z := x / m" operation where x is n digits, result z is k. +// Truncates the quotient in general, but always (for nonzero m) returns +// the true remainder x mod m. 
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = m, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cdiv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cdiv) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 +#define x x3 +#define m x4 + +// Main variables + +#define w x5 +#define i x6 +#define a x7 +#define c x8 +#define d x9 +#define e x10 +#define f x11 +#define l x12 + +// These two are the same + +#define h x13 +#define q x13 + +// Variables for the negmodinv + +#define one x6 +#define e1 x6 +#define e2 x7 +#define e4 x6 +#define e8 x7 + +// Variable to hold the remainder + +#define r x14 + +S2N_BN_SYMBOL(bignum_cdiv): + +// Effectively the same dataflow as bignum_cmod, with some basic +// variable changes (using n for the size not k, returning r, etc.) +// and using the i counter instead of modifying the size as a loop +// counter. + + mov r, xzr + cbz n, bignum_cdiv_nomodulus + + clz e, m + lsl f, m, e + + lsr a, f, #16 + eor w, a, #0x1ffffffffffff + add a, a, #0x1 + lsr w, w, #32 + mneg r, a, w + lsr d, r, #49 + mul d, d, d + lsr r, r, #34 + add r, d, r + orr d, d, #0x40000000 + mul d, r, d + lsr d, d, #30 + lsl r, w, #30 + madd w, w, d, r + lsr w, w, #30 + mneg r, a, w + lsr r, r, #24 + mul r, r, w + lsl w, w, #16 + lsr r, r, #24 + add w, w, r + mneg r, a, w + lsr r, r, #32 + mul r, r, w + lsl w, w, #31 + lsr r, r, #17 + add w, w, r + mul d, f, w + umulh r, f, w + extr d, r, d, #60 + lsr r, w, #33 + mvn d, d + mul d, r, d + lsl w, w, #1 + lsr d, d, #33 + add w, w, d + adds d, w, #0x1 + cinv d, d, eq + umulh r, f, d + adds xzr, r, f + csel w, w, d, cs + + mneg r, w, f + + mov h, xzr + mov l, xzr + mov i, n +bignum_cdiv_modloop: + sub i, i, #1 + ldr d, [x, i, lsl #3] + mul a, r, h + umulh h, r, h + adds a, a, d + adcs h, h, l + csel l, r, xzr, cs + adds l, l, a + adc h, h, xzr + cbnz i, bignum_cdiv_modloop + + umulh c, w, h + adds c, c, h + csel r, f, xzr, cs + + mul a, c, f + umulh d, c, f + add d, d, r + subs l, l, a + sbcs h, h, d + + csel a, f, xzr, ne + subs l, l, a + sbcs h, h, xzr + + csel a, f, xzr, ne + sub l, l, a + + umulh c, w, l + adds c, c, l + cset r, cs + extr c, r, c, #1 + + eor e, e, #63 + lsr c, c, e + + mul a, c, m + sub l, l, a + + subs r, l, m + csel r, r, l, cs + +bignum_cdiv_nomodulus: + +// If k = 0 then there's no more to be done + + cbz k, bignum_cdiv_end + +// Let e be the number of trailing zeros in m. This implementation uses +// 63 - clz(-m & m) which is a bit slicker than the main word_ctz function +// but fails for m = 0. We don't have to worry about that case here. + + neg e, m + and e, e, m + clz e, e + eor e, e, #63 + +// Also generate a corresponding bitmask f for selecting bottom 64 - e bits. + + mov f, #-1 + lsr f, f, e + +// Now just shift m right by e bits. So hereafter we can assume m is odd +// but we first need to shift the input right by e bits then divide by m. + + lsr m, m, e + +// Compute the negated modular inverse w with w * m + 1 == 0 (mod 2^64) +// This is essentially the same as word_negmodinv. + + sub w, m, m, lsl #2 + eor w, w, #2 + mov one, #1 + madd e1, m, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// We have the remainder r, so now x = m * y + r for some quotient y +// to be computed. Consider x' = x + (m - r) = m * (y + 1) and do a +// Montgomery reduction, keeping the cofactor z. 
This gives us +// x' + m * z = 2^{64k} * c where c <= m. Thus since x' = m * (y + 1) +// we have +// +// m * (y + z + 1) = 2^{64k} * c +// +// This means m * (y + z + 1) == 0 (mod 2^{64k}), even when we truncate +// x to k digits (if in fact k < n). Since m is odd, it's coprime to +// 2^{64k} so we can cancel and get y + z + 1 == 0 (mod 2^{64k}), and +// hence using logical complement y == ~z (mod 2^{64k}). Thus we can +// write back the logical complements of the cofactor as the answer. +// Start with carry word c = m - r/2^e to make the initial tweak +// x' = x + (m - r); since we've shifted everything initially by e +// we need to shift the remainder too before subtracting from the +// shifted m. + + lsr c, r, e + sub c, m, c + mov i, xzr + +// Unless n = 0, preload the zeroth digit shifted right e places and bump +// up the x pointer by 8 and n down by 1, to ease indexing and comparison +// using the same variable i in the main loop. When n = 0 we leave it alone, +// as the comparison i < n will always fail and the x pointer is unused. + + mov d, xzr + cbz n, bignum_cdiv_loop + ldr d, [x], #8 + lsr d, d, e + sub n, n, 1 + +bignum_cdiv_loop: + +// Load the next digit up to get [l,d] then shift right e places, +// eventually setting d back to the other part of the newly loaded digit +// ready for the next time round the loop. + + mov l, xzr + cmp i, n + bcs bignum_cdiv_noload + ldr l, [x, i, lsl #3] +bignum_cdiv_noload: + rorv l, l, e + bic a, l, f + orr a, d, a + and d, l, f + +// Now a is the next digit after shifting right by e places, c the carry-in. +// Do the main Montgomery step with the (odd) m, writing back ~q. + + adds a, a, c + mul q, a, w + cset c, cs + mvn l, q + str l, [z, i, lsl #3] + + mul l, q, m + umulh h, q, m + + adds l, l, a + adc c, h, c + + add i, i, #1 + cmp i, k + bcc bignum_cdiv_loop + +// And return the remainder + +bignum_cdiv_end: + + mov x0, r + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv_exact.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv_exact.S new file mode 100644 index 00000000000..cdf0eff0ff4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cdiv_exact.S @@ -0,0 +1,162 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide by a single word, z := x / m *when known to be exact* +// Inputs x[n], m; output z[k] +// +// extern void bignum_cdiv_exact +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); +// +// Does the "z := x / m" operation where x is n digits and result z is k, +// *assuming* that m is nonzero and that the input x is in fact an +// exact multiple of m. (If this isn't known, use the general bignum_cdiv +// function instead.) In general the result is truncated to k digits. 
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cdiv_exact) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cdiv_exact) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 +#define x x3 +#define m x4 + +// Main variables + +#define w x5 +#define i x6 +#define a x7 +#define c x8 +#define d x9 +#define e x10 +#define f x11 +#define l x12 + +// These two are the same + +#define h x13 +#define q x13 + +// Variables for the negmodinv + +#define one x6 +#define e1 x6 +#define e2 x7 +#define e4 x6 +#define e8 x7 + +S2N_BN_SYMBOL(bignum_cdiv_exact): + +// If k = 0 then there's nothing to be done + + cbz k, bignum_cdiv_exact_end + +// Let e be the number of trailing zeros in m. This implementation uses +// 63 - clz(-m & m) which is a bit slicker than the main word_ctz function +// but fails for m = 0. We don't have to worry about that case here. + + neg e, m + and e, e, m + clz e, e + eor e, e, #63 + +// Also generate a corresponding bitmask f for selecting bottom 64 - e bits. + + mov f, #-1 + lsr f, f, e + +// Now just shift m right by e bits. So hereafter we can assume m is odd +// but we first need to shift the input right by e bits then divide by m. + + lsr m, m, e + +// Compute the negated modular inverse w with w * m + 1 == 0 (mod 2^64) +// This is essentially the same as word_negmodinv. + + sub w, m, m, lsl #2 + eor w, w, #2 + mov one, #1 + madd e1, m, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Consider x' = x + m and do a Montgomery reduction, keeping the cofactor z. +// This gives us x' + m * z = 2^{64k} * c where c <= m. Assuming x = m * y +// we then have m * y + m + m * z = 2^{64k} * c, i.e. +// +// m * (y + z + 1) = 2^{64k} * c +// +// This means m * (y + z + 1) == 0 (mod 2^{64k}), even when we truncate +// x to k digits (if in fact k < n). Since m is odd, it's coprime to +// 2^{64k} so we can cancel and get y + z + 1 == 0 (mod 2^{64k}), and +// hence using logical complement y == ~z (mod 2^{64k}). Thus we can +// write back the logical complements of the cofactor as the answer. +// Start with carry word c = m to make the initial tweak x' = x + m. + + mov c, m + mov i, xzr + +// Unless n = 0, preload the zeroth digit shifted right e places and bump +// up the x pointer by 8 and n down by 1, to ease indexing and comparison +// using the same variable i in the main loop. When n = 0 we leave it alone, +// as the comparison i < n will always fail and the x pointer is unused. + + mov d, xzr + cbz n, bignum_cdiv_exact_loop + ldr d, [x], #8 + lsr d, d, e + sub n, n, 1 + +bignum_cdiv_exact_loop: + +// Load the next digit up to get [l,d] then shift right e places, +// eventually setting d back to the other part of the newly loaded digit +// ready for the next time round the loop. + + mov l, xzr + cmp i, n + bcs bignum_cdiv_exact_noload + ldr l, [x, i, lsl #3] +bignum_cdiv_exact_noload: + rorv l, l, e + bic a, l, f + orr a, d, a + and d, l, f + +// Now a is the next digit after shifting right by e places, c the carry-in. +// Do the main Montgomery step with the (odd) m, writing back ~q. 
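+
+// A minimal C sketch of this step (illustrative only, not part of the
+// upstream source), using unsigned __int128 for the double-word product.
+// q is chosen so the running value becomes divisible by 2^64, and its
+// complement ~q is the digit written back to z:
+//
+//   unsigned __int128 s = (unsigned __int128) a + c;
+//   uint64_t a1 = (uint64_t) s;
+//   uint64_t q = a1 * w;                        // a1 + q * m == 0 (mod 2^64)
+//   z[i] = ~q;
+//   unsigned __int128 p = (unsigned __int128) q * m + a1;
+//   c = (uint64_t) (p >> 64) + (uint64_t) (s >> 64);  // carry to next digit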
+ + adds a, a, c + mul q, a, w + cset c, cs + mvn l, q + str l, [z, i, lsl #3] + + mul l, q, m + umulh h, q, m + + adds l, l, a + adc c, h, c + + add i, i, #1 + cmp i, k + bcc bignum_cdiv_exact_loop + +bignum_cdiv_exact_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cld.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cld.S new file mode 100644 index 00000000000..3952abbc024 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cld.S @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero digits (64-bit words) +// Input x[k]; output function return +// +// extern uint64_t bignum_cld (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is k +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cld) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cld) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define a x3 +#define j x4 + + +S2N_BN_SYMBOL(bignum_cld): + +// If the bignum is zero-length, x0 is already the right answer of k = 0 + + cbz k, bignum_cld_end + +// Run over the words j = 0..i-1, and set i := j + 1 when hitting nonzero a[j] + + mov i, xzr + mov j, xzr +bignum_cld_loop: + ldr a, [x, j, lsl #3] + add j, j, #1 + cmp a, #0 + csel i, j, i, ne + cmp j, k + bne bignum_cld_loop + + sub x0, x0, i +bignum_cld_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_clz.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_clz.S new file mode 100644 index 00000000000..48c6da3f761 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_clz.S @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero bits +// Input x[k]; output function return +// +// extern uint64_t bignum_clz (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 64 * k +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_clz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_clz) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define w x3 +#define a x4 +#define j x5 + + +S2N_BN_SYMBOL(bignum_clz): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_clz_end + +// Use w = a[i-1] to store nonzero words in a bottom-up sweep +// Set the initial default to be as if we had a 11...11 word directly below + + mov i, xzr + mov w, #-1 + mov j, xzr +bignum_clz_loop: + ldr a, [x, j, lsl #3] + add j, j, #1 + cmp a, #0 + csel i, j, i, ne + csel w, a, w, ne + cmp j, k + bne bignum_clz_loop + +// Now w = a[i-1] is the highest nonzero word, or in the zero case the +// default of the "extra" 11...11 = a[0-1]. We now want 64*(k - i) + clz(w). +// Note that this code does not rely on the behavior of the clz instruction +// for zero inputs, though the ARM manual does in fact guarantee clz(0) = 64. + + sub k, k, i + lsl k, k, #6 + clz a, w + add x0, k, a + +bignum_clz_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmadd.S new file mode 100644 index 00000000000..6211707f5e9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmadd.S @@ -0,0 +1,113 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply-add with single-word multiplier, z := z + c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmadd +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := z + c * y" operation where y is n digits, result z is p. +// Truncates the result in general. +// +// The return value is a high/carry word that is meaningful when p = n + 1, or +// more generally when n <= p and the result fits in p + 1 digits. In these +// cases it gives the top digit of the (p + 1)-digit result. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = c, X3 = n, X4 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd) + .text + .balign 4 + +#define p x0 +#define z x1 +#define c x2 +#define n x3 +#define x x4 + +#define i x5 +#define h x6 +#define l x7 +#define a x8 + +#define b x9 + + +S2N_BN_SYMBOL(bignum_cmadd): + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. 
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmp n, p + csel n, p, n, cs + sub p, p, n + +// Initialize high part h = 0; if n = 0 do nothing but return that zero + + adds h, xzr, xzr + cbz n, bignum_cmadd_end + +// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0 + + ldr a, [x] + mul l, c, a + umulh h, c, a + ldr b, [z] + adds b, b, l + str b, [z] + mov i, #8 + sub n, n, #1 + cbz n, bignum_cmadd_tail + +// Main loop, where we always have CF + previous high part h to add in + +bignum_cmadd_loop: + ldr a, [x, i] + ldr b, [z, i] + mul l, c, a + adcs b, b, h + umulh h, c, a + adc h, h, xzr + adds b, b, l + str b, [z, i] + add i, i, #8 + sub n, n, #1 + cbnz n, bignum_cmadd_loop + +// Propagate the carry all the way to the end with h as extra carry word + +bignum_cmadd_tail: + cbz p, bignum_cmadd_end + ldr b, [z, i] + adcs b, b, h + str b, [z, i] + mov h, xzr + sub p, p, #1 + cbz p, bignum_cmadd_end + +bignum_cmadd_tloop: + add i, i, #8 + ldr b, [z, i] + adcs b, b, xzr + str b, [z, i] + sub p, p, #1 + cbnz p, bignum_cmadd_tloop + +// Return the high/carry word. This gives the top word of the result provided +// n <= p and the result fits in p + 1 digits. More generally, indeed, the +// 2^64 * CF + return = the top part of the result whenever n <= p, though this +// is not very exploitable from a C call. + +bignum_cmadd_end: + adcs x0, h, xzr + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmnegadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmnegadd.S new file mode 100644 index 00000000000..549fba7c9cc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmnegadd.S @@ -0,0 +1,127 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negated multiply-add with single-word multiplier, z := z - c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmnegadd +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := z - c * y" operation where y is n digits, result z is p. +// Truncates the result in general. +// +// The return value is a high/carry word that is meaningful when n <= p. +// It is interpreted negatively as z' - 2^{64k} * return = z - c * y. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = c, X3 = n, X4 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmnegadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmnegadd) + .text + .balign 4 + +#define p x0 +#define z x1 +#define c x2 +#define n x3 +#define x x4 + +#define i x5 +#define h x6 +#define l x7 +#define a x8 + +#define b x9 + + +S2N_BN_SYMBOL(bignum_cmnegadd): + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. 
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmp n, p + csel n, p, n, cs + sub p, p, n + +// Initialize high part h = 0; if n = 0 do nothing but return that zero + + mov h, xzr + cbz n, bignum_cmnegadd_end + +// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * ~x_0 + c + + ldr a, [x] + mvn a, a + mul l, c, a + umulh h, c, a + adds l, l, c + adc h, h, xzr + ldr b, [z] + adds b, b, l + str b, [z] + mov i, #8 + sub n, n, #1 + cbz n, bignum_cmnegadd_tail + +// Main loop, where we always have CF + previous high part h to add in + +bignum_cmnegadd_loop: + ldr a, [x, i] + ldr b, [z, i] + mvn a, a + mul l, c, a + adcs b, b, h + umulh h, c, a + adc h, h, xzr + adds b, b, l + str b, [z, i] + add i, i, #8 + sub n, n, #1 + cbnz n, bignum_cmnegadd_loop + +// At this point we have 2^{64n} * (h + CF) + z' = z + c * (2^{64n} - x) +// so z' - 2^{64n} * (c - (h + CF)) = z - c * x. +// Since z - c * x < 2^{64n} we must have c - (h + CF) >= 0. +// Accumulate the negative carry in h for consistency with trivial cases. + +bignum_cmnegadd_tail: + adc h, h, xzr + sub h, c, h + +// Propagate the carry all the way to the end with h as extra carry word + + cbz p, bignum_cmnegadd_end + ldr b, [z, i] + subs b, b, h + str b, [z, i] + mov h, xzr + sub p, p, #1 + cbz p, bignum_cmnegadd_highend + +bignum_cmnegadd_tloop: + add i, i, #8 + ldr b, [z, i] + sbcs b, b, xzr + str b, [z, i] + sub p, p, #1 + cbnz p, bignum_cmnegadd_tloop + +// Adjust the high word with the inverted carry h := h + (1 - CF) + +bignum_cmnegadd_highend: + cset x0, cc + add h, h, x0 + +// Now copy h into the function return + +bignum_cmnegadd_end: + mov x0, h + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmod.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmod.S new file mode 100644 index 00000000000..baf57e8c4d5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmod.S @@ -0,0 +1,179 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Find bignum modulo a single word +// Input x[k], m; output function return +// +// extern uint64_t bignum_cmod (uint64_t k, uint64_t *x, uint64_t m); +// +// Returns x mod m, assuming m is nonzero. +// +// Standard ARM ABI: X0 = k, X1 = x, X2 = m, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmod) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmod) + .text + .balign 4 + +#define k x0 +#define x x1 +#define m x2 + +#define e x3 +#define n x4 +#define w x5 + +#define r x6 +#define h x7 +#define l x8 +#define a x9 +#define d x10 + +// We re-use the k argument for a quotient estimate when it is no longer +// needed for traversal (x0 is modified for the return value anyway). + +#define q x0 + +S2N_BN_SYMBOL(bignum_cmod): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_cmod_end + +// Find number of leading zeros of m and let n = 2^e m so that for an +// in-scope (nonzero) input m we have n >= 2^63, e <= 63. 
+ + clz e, m + lsl n, m, e + +// A near-clone of word_recip so 2^64 + w = ceil(2^128 / n) - 1 + + lsr a, n, #16 + eor w, a, #0x1ffffffffffff + add a, a, #0x1 + lsr w, w, #32 + mneg r, a, w + lsr d, r, #49 + mul d, d, d + lsr r, r, #34 + add r, d, r + orr d, d, #0x40000000 + mul d, r, d + lsr d, d, #30 + lsl r, w, #30 + madd w, w, d, r + lsr w, w, #30 + mneg r, a, w + lsr r, r, #24 + mul r, r, w + lsl w, w, #16 + lsr r, r, #24 + add w, w, r + mneg r, a, w + lsr r, r, #32 + mul r, r, w + lsl w, w, #31 + lsr r, r, #17 + add w, w, r + mul d, n, w + umulh r, n, w + extr d, r, d, #60 + lsr r, w, #33 + mvn d, d + mul d, r, d + lsl w, w, #1 + lsr d, d, #33 + add w, w, d + adds d, w, #0x1 + cinv d, d, eq + umulh r, n, d + adds xzr, r, n + csel w, w, d, cs + +// Take the residue r = 2^128 - (2^64 + w) * n, which by the above bound +// we know fits in 64 bits. We know 2^128 == r (mod n) and hence (mod m). + + mneg r, w, n + +// Now just go down through the digits accumulating [h;l] == x (mod n) +// by 2^64 * [h;l] + d = 2^128 * h + [l;d] == r * h + [l; d]. That addition +// may overflow with a carry, say 2^128 + [h';l'] = r * h + [l; d], in +// which case we subtract 2^128 - r (which is divisible by m and keeping +// things in 128 bits we just add r). Thus the overall bound when we initially +// overflow is r * h + [l; d] - (2^128 - r) = r * (h + 1) + [l; d] - 2^128 +// < 2^128 so we stay inside 2 words + + mov h, xzr + mov l, xzr +bignum_cmod_loop: + sub k, k, #1 + ldr d, [x, k, lsl #3] + mul a, r, h + umulh h, r, h + adds a, a, d + adcs h, h, l + csel l, r, xzr, cs + adds l, l, a + adc h, h, xzr + cbnz k, bignum_cmod_loop + +// Now do reciprocal multiplication to reduce the 2-word modular equivalent +// [h;l] to the single word l. If we assume the truncations are as follows +// 2^64 + w = 2^128 / n - epsilon (0 <= epsilon <= 1) +// q = (w * h / 2^64) - delta (0 <= delta <= 1) +// the net remainder is l + (h/2^64 * epsilon + delta) * n < l + 2 * n. +// In general this needs two rounds of comparison to guarantee getting +// into a single word (though one more mul could be used instead). +// Also, the quotient estimate can overflow so we use r as extra addend +// 2^64 * n when the initial addition overflows. The overall multiple +// of n can't itself overflow, since we know it's an underestimate of +// the initial residue. + + umulh q, w, h + adds q, q, h + csel r, n, xzr, cs + + mul a, q, n + umulh d, q, n + add d, d, r + subs l, l, a + sbcs h, h, d + + csel a, n, xzr, ne + subs l, l, a + sbcs h, h, xzr + + csel a, n, xzr, ne + sub l, l, a + +// One more reciprocal multiplication to do a modular reduction, but now in +// one word and in terms of the original m. For the quotient estimate we want +// q = ((2^64 + w) * l) / 2^{128-e} = ((2^64 + w) * l) / 2^65 / 2^{63-e}. + + umulh q, w, l + adds q, q, l + cset r, cs + extr q, r, q, #1 + + eor e, e, #63 + lsr q, q, e + + mul a, q, m + sub l, l, a + +// Note that since there is no neglected "low" part of the single word, +// one round of correction suffices; in the analog of the above l = 0 +// and hence the residue so far is already < 2 * m. 
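+
+// A minimal C sketch of this final reduction (illustrative only, not part of
+// the upstream source), with e = clz(m) as set at the top of the function and
+// unsigned __int128 used for the double-word product:
+//
+//   uint64_t hi = (uint64_t) (((unsigned __int128) w * l) >> 64);
+//   unsigned __int128 s = (unsigned __int128) hi + l; // ((2^64 + w) * l) >> 64
+//   uint64_t q = (uint64_t) (s >> 1);                 // now divided by 2^65
+//   q >>= (e ^ 63);                                   // 63 - e more: / 2^{128-e}
+//   l -= q * m;                                       // now 0 <= l < 2 * m
+//   return (l < m) ? l : l - m;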
+ + subs x0, l, m + csel x0, x0, l, cs + +bignum_cmod_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmul.S new file mode 100644 index 00000000000..c17cd621568 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_cmul.S @@ -0,0 +1,104 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word, z := c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmul +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := c * y" operation where y is n digits, result z is p. +// Truncates the result in general unless p >= n + 1. +// +// The return value is a high/carry word that is meaningful when p >= n as +// giving the high part of the result. Since this is always zero if p > n, +// it is mainly of interest in the special case p = n, i.e. where the source +// and destination have the same nominal size, when it gives the extra word +// of the full result. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = c, X3 = n, X4 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul) + .text + .balign 4 + +#define p x0 +#define z x1 +#define c x2 +#define n x3 +#define x x4 + +#define i x5 +#define h x6 +#define l x7 +#define a x8 + + +S2N_BN_SYMBOL(bignum_cmul): + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. +// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmp n, p + csel n, p, n, cs + sub p, p, n + +// Initialize current input/output pointer offset i and high part h. +// But then if n = 0 skip the multiplication and go to the tail part + + mov h, xzr + mov i, xzr + cbz n, bignum_cmul_tail + +// Initialization of the loop: [h,l] = c * x_0 + + ldr a, [x] + mul l, c, a + umulh h, c, a + str l, [z] + add i, i, #8 + subs n, n, #1 + beq bignum_cmul_tail + +// Main loop (force CF = 0 at the beginning) + + adds xzr, xzr, xzr +bignum_cmul_loop: + ldr a, [x, i] + mul l, c, a + adcs l, l, h + umulh h, c, a + str l, [z, i] + add i, i, #8 + sub n, n, #1 + cbnz n, bignum_cmul_loop + + adc h, h, xzr + +bignum_cmul_tail: + cbz p, bignum_cmul_end + str h, [z, i] + mov h, xzr + subs p, p, #1 + beq bignum_cmul_end + +bignum_cmul_tloop: + add i, i, #8 + str xzr, [z, i] + sub p, p, #1 + cbnz p, bignum_cmul_tloop + +// Return the high/carry word + +bignum_cmul_end: + mov x0, h + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_coprime.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_coprime.S new file mode 100644 index 00000000000..3bed2cc8ae1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_coprime.S @@ -0,0 +1,450 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignums for coprimality, gcd(x,y) = 1 +// Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)] +// +// extern uint64_t bignum_coprime +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y, uint64_t *t); +// +// Test for whether two bignums are coprime (no common factor besides 1). +// This is equivalent to testing if their gcd is 1, but a bit faster than +// doing those two computations separately. +// +// Here bignum x is m digits long, y is n digits long and the temporary +// buffer t needs to be 2 * max(m,n) digits long. The return value is +// 1 if coprime(x,y) and 0 otherwise. +// +// Standard ARM ABI: X0 = m, X1 = x, X2 = n, X3 = y, X4 = t, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_coprime) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_coprime) + .text + .balign 4 + +#define CHUNKSIZE 58 + +// Pervasive variables + +#define k x9 +#define m x4 +#define n x5 + +// Used via parameters in copy-in loop, then re-used as outer loop +// counter t and adaptive precision digit size l, which becomes a +// reduced version of k in later iterations but starts at l = k + +#define x x1 +#define y x3 + +#define t x2 +#define l x3 + +// The matrix of update factors to apply to m and n +// Also used a couple of additional temporary variables for the swapping loop +// Also used as an extra down-counter in corrective negation loops + +#define m_m x6 +#define m_n x7 +#define n_m x8 +#define n_n x1 + +#define t3 x6 +#define t4 x7 + +#define j x6 + +// General temporary variables and loop counters + +#define i x10 +#define t1 x11 +#define t2 x12 + +// High and low proxies for the inner loop +// Then re-used for high and carry words during actual cross-multiplications + +#define m_hi x13 +#define n_hi x14 +#define m_lo x15 +#define n_lo x16 + +#define h1 x13 +#define h2 x14 +#define l1 x15 +#define l2 x16 + +#define c1 x17 +#define c2 x19 +#define tt x20 + + +S2N_BN_SYMBOL(bignum_coprime): + +// We make use of just a couple of additional registers + + stp x19, x20, [sp, #-16]! + +// Compute k = max(m,n), and if this is zero skip to the end. Note that +// in this case x0 = m = 0 so we return the right answer of "false" + + cmp x0, x2 + csel k, x2, x0, cc + cbz k, bignum_coprime_end + +// Set up inside w two size-k buffers m and n + + lsl i, k, #3 + add n, m, i + +// Copy the input x into the buffer m, padding with zeros as needed + + mov i, xzr + cbz x0, bignum_coprime_xpadloop +bignum_coprime_xloop: + ldr t1, [x1, i, lsl #3] + str t1, [m, i, lsl #3] + add i, i, #1 + cmp i, x0 + bcc bignum_coprime_xloop + cmp i, k + bcs bignum_coprime_xskip +bignum_coprime_xpadloop: + str xzr, [m, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_coprime_xpadloop +bignum_coprime_xskip: + +// Copy the input y into the buffer n, padding with zeros as needed + + mov i, xzr + cbz x2, bignum_coprime_ypadloop +bignum_coprime_yloop: + ldr t1, [x3, i, lsl #3] + str t1, [n, i, lsl #3] + add i, i, #1 + cmp i, x2 + bcc bignum_coprime_yloop + cmp i, k + bcs bignum_coprime_yskip +bignum_coprime_ypadloop: + str xzr, [n, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_coprime_ypadloop +bignum_coprime_yskip: + +// Set up the outer loop count of 64 * sum of input sizes. +// The invariant is that m * n < 2^t at all times. 
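+
+// In C terms (illustrative only, not part of the upstream source), with
+// msize/nsize standing for the original digit counts still in x0 and x2:
+//
+//   uint64_t t = (msize + nsize) << 6;   // t = 64 * (msize + nsize)
+//
+// Initially m < 2^{64*msize} and n < 2^{64*nsize}, so m * n < 2^t; the outer
+// loop below preserves m * n < 2^t while t drops by CHUNKSIZE each time, so
+// once t <= 0 we have m * n < 1, i.e. m = 0 and n holds the gcd.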
+ + add t, x0, x2 + lsl t, t, #6 + +// Record for the very end the OR of the lowest words. +// If the bottom bit is zero we know both are even so the answer is false. +// But since this is constant-time code we still execute all the main part. + + ldr x0, [m] + ldr t3, [n] + orr x0, x0, t3 + +// Now if n is even trigger a swap of m and n. This ensures that if +// one or other of m and n is odd then we make sure now that n is, +// as expected by our invariant later on. + + and t3, t3, #1 + sub t3, t3, #1 + + mov i, xzr +bignum_coprime_swaploop: + ldr t1, [m, i, lsl #3] + ldr t2, [n, i, lsl #3] + eor t4, t1, t2 + and t4, t4, t3 + eor t1, t1, t4 + eor t2, t2, t4 + str t1, [m, i, lsl #3] + str t2, [n, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_coprime_swaploop + +// Start of the main outer loop iterated t / CHUNKSIZE times + +bignum_coprime_outerloop: + +// We need only bother with sharper l = min k (ceil(t/64)) digits +// Either both m and n fit in l digits, or m has become zero and so +// nothing happens in the loop anyway and this makes no difference. + + add i, t, #63 + lsr l, i, #6 + cmp l, k + csel l, k, l, cs + +// Select upper and lower proxies for both m and n to drive the inner +// loop. The lower proxies are simply the lowest digits themselves, +// m_lo = m[0] and n_lo = n[0], while the upper proxies are bitfields +// of the two inputs selected so their top bit (63) aligns with the +// most significant bit of *either* of the two inputs. + + mov h1, xzr // Previous high and low for m + mov l1, xzr + mov h2, xzr // Previous high and low for n + mov l2, xzr + mov c2, xzr // Mask flag: previous word of one was nonzero + // and in this case h1 and h2 are those words + + mov i, xzr +bignum_coprime_toploop: + ldr t1, [m, i, lsl #3] + ldr t2, [n, i, lsl #3] + orr c1, t1, t2 + cmp c1, xzr + and c1, c2, h1 + csel l1, c1, l1, ne + and c1, c2, h2 + csel l2, c1, l2, ne + csel h1, t1, h1, ne + csel h2, t2, h2, ne + csetm c2, ne + add i, i, #1 + cmp i, l + bcc bignum_coprime_toploop + + orr t1, h1, h2 + clz t2, t1 + negs c1, t2 + lsl h1, h1, t2 + csel l1, l1, xzr, ne + lsl h2, h2, t2 + csel l2, l2, xzr, ne + lsr l1, l1, c1 + lsr l2, l2, c1 + orr m_hi, h1, l1 + orr n_hi, h2, l2 + + ldr m_lo, [m] + ldr n_lo, [n] + +// Now the inner loop, with i as loop counter from CHUNKSIZE down. +// This records a matrix of updates to apply to the initial +// values of m and n with, at stage j: +// +// sgn * m' = (m_m * m - m_n * n) / 2^j +// -sgn * n' = (n_m * m - n_n * n) / 2^j +// +// where "sgn" is either +1 or -1, and we lose track of which except +// that both instance above are the same. This throwing away the sign +// costs nothing (since we have to correct in general anyway because +// of the proxied comparison) and makes things a bit simpler. But it +// is simply the parity of the number of times the first condition, +// used as the swapping criterion, fires in this loop. + + mov m_m, #1 + mov m_n, xzr + mov n_m, xzr + mov n_n, #1 + + mov i, #CHUNKSIZE + +// Conceptually in the inner loop we follow these steps: +// +// * If m_lo is odd and m_hi < n_hi, then swap the four pairs +// (m_hi,n_hi); (m_lo,n_lo); (m_m,n_m); (m_n,n_n) +// +// * Now, if m_lo is odd (old or new, doesn't matter as initial n_lo is odd) +// m_hi := m_hi - n_hi, m_lo := m_lo - n_lo +// m_m := m_m + n_m, m_n := m_n + n_n +// +// * Halve and double them +// m_hi := m_hi / 2, m_lo := m_lo / 2 +// n_m := n_m * 2, n_n := n_n * 2 +// +// The actual computation computes updates before actually swapping and +// then corrects as needed. 
It also maintains the invariant ~ZF <=> odd(m_lo), +// since it seems to reduce the dependent latency. Set that up first. + + ands xzr, m_lo, #1 + +bignum_coprime_innerloop: + +// At the start of the loop ~ZF <=> m_lo is odd; mask values accordingly +// Set the flags for m_hi - [~ZF] * n_hi so we know to flip things. + + csel t1, n_hi, xzr, ne + csel t2, n_lo, xzr, ne + csel c1, n_m, xzr, ne + csel c2, n_n, xzr, ne + ccmp m_hi, n_hi, #0x2, ne + +// Compute subtractive updates, trivial in the case ZF <=> even(m_lo). + + sub t1, m_hi, t1 + sub t2, m_lo, t2 + +// If the subtraction borrows, swap things appropriately, negating where +// we've already subtracted so things are as if we actually swapped first. + + csel n_hi, n_hi, m_hi, cs + cneg t1, t1, cc + csel n_lo, n_lo, m_lo, cs + cneg m_lo, t2, cc + csel n_m, n_m, m_m, cs + csel n_n, n_n, m_n, cs + +// Update and shift while setting oddness flag for next iteration +// We look at bit 1 of t2 (m_lo before possible negation), which is +// safe because it is even. + + ands xzr, t2, #2 + add m_m, m_m, c1 + add m_n, m_n, c2 + lsr m_hi, t1, #1 + lsr m_lo, m_lo, #1 + add n_m, n_m, n_m + add n_n, n_n, n_n + +// Next iteration; don't disturb the flags since they are used at entry + + sub i, i, #1 + cbnz i, bignum_coprime_innerloop + +// Now actually compute the updates to m and n corresponding to that matrix, +// and correct the signs if they have gone negative. First we compute the +// (k+1)-sized updates +// +// c1::h1::m = m_m * m - m_n * n +// c2::h2::n = n_m * m - n_n * n +// +// then for each one, sign-correct and shift by CHUNKSIZE + + mov h1, xzr + mov h2, xzr + mov c1, xzr + mov c2, xzr + mov i, xzr +bignum_coprime_crossloop: + ldr t1, [m, i, lsl #3] + ldr t2, [n, i, lsl #3] + + mul l1, m_m, t1 + mul l2, m_n, t2 + adds l1, l1, h1 + umulh h1, m_m, t1 + adc h1, h1, xzr + umulh tt, m_n, t2 + sub c1, tt, c1 + subs l1, l1, l2 + str l1, [m, i, lsl #3] + sbcs h1, h1, c1 + csetm c1, cc + + mul l1, n_m, t1 + mul l2, n_n, t2 + adds l1, l1, h2 + umulh h2, n_m, t1 + adc h2, h2, xzr + umulh tt, n_n, t2 + sub c2, tt, c2 + subs l1, l1, l2 + str l1, [n, i, lsl #3] + sbcs h2, h2, c2 + csetm c2, cc + + add i, i, #1 + cmp i, l + bcc bignum_coprime_crossloop + +// Write back m optionally negated and shifted right CHUNKSIZE bits + + adds xzr, c1, c1 + + ldr l1, [m] + mov i, xzr + sub j, l, #1 + cbz j, bignum_coprime_negskip1 + +bignum_coprime_negloop1: + add t1, i, #8 + ldr t2, [m, t1] + extr l1, t2, l1, #CHUNKSIZE + eor l1, l1, c1 + adcs l1, l1, xzr + str l1, [m, i] + mov l1, t2 + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_coprime_negloop1 +bignum_coprime_negskip1: + extr l1, h1, l1, #CHUNKSIZE + eor l1, l1, c1 + adcs l1, l1, xzr + str l1, [m, i] + +// Write back n optionally negated and shifted right CHUNKSIZE bits + + adds xzr, c2, c2 + + ldr l1, [n] + mov i, xzr + sub j, l, #1 + cbz j, bignum_coprime_negskip2 +bignum_coprime_negloop2: + add t1, i, #8 + ldr t2, [n, t1] + extr l1, t2, l1, #CHUNKSIZE + eor l1, l1, c2 + adcs l1, l1, xzr + str l1, [n, i] + mov l1, t2 + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_coprime_negloop2 +bignum_coprime_negskip2: + extr l1, h2, l1, #CHUNKSIZE + eor l1, l1, c2 + adcs l1, l1, xzr + str l1, [n, i] + +// End of main loop. We can stop if t' <= 0 since then m * n < 2^0, which +// since n is odd (in the main cases where we had one or other input odd) +// means that m = 0 and n is the final gcd. 
Moreover we do in fact need to +// maintain strictly t > 0 in the main loop, or the computation of the +// optimized digit bound l could collapse to 0. + + subs t, t, #CHUNKSIZE + bhi bignum_coprime_outerloop + +// Now compare n with 1 (OR of the XORs in t1) + + ldr t1, [n] + eor t1, t1, #1 + cmp k, #1 + beq bignum_coprime_finalcomb + mov i, #1 +bignum_coprime_compareloop: + ldr t2, [n, i, lsl #3] + orr t1, t1, t2 + add i, i, #1 + cmp i, k + bcc bignum_coprime_compareloop + +// Now combine that with original oddness flag, which is still in x0 + +bignum_coprime_finalcomb: + cmp t1, xzr + cset t1, eq + and x0, x0, t1 + +bignum_coprime_end: + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy.S new file mode 100644 index 00000000000..a4c0a7fd818 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy.S @@ -0,0 +1,63 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Copy bignum with zero-extension or truncation, z := x +// Input x[n]; output z[k] +// +// extern void bignum_copy +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 +#define x x3 + +#define i x4 +#define a x5 + + +S2N_BN_SYMBOL(bignum_copy): + +// Replace n with min(k,n) so we are definitely safe copying those +// Initialize the element counter to 0 + + cmp k, n + csel n, k, n, cc + mov i, #0 + +// If min(k,n) = 0 jump to the padding stage + + cbz n, bignum_copy_padding + +bignum_copy_copyloop: + ldr a, [x, i, lsl #3] + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, n + bcc bignum_copy_copyloop + +bignum_copy_padding: + cmp i, k + bcs bignum_copy_end +bignum_copy_padloop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_copy_padloop + +bignum_copy_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table.S similarity index 100% rename from third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table.S diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_16.S similarity index 92% rename from third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_16.S index ea0bef702ce..764fbbb8ffb 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_16.S @@ -8,7 +8,7 @@ // achieved by reading the whole table and using the bit-masking to get the // `idx`-th 
row. // -// extern void bignum_copy_from_table_16_neon +// extern void bignum_copy_from_table_16 // (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx); // // Initial version written by Hanno Becker @@ -16,8 +16,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_16_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_16_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_16) .text .balign 4 @@ -57,7 +57,7 @@ #define vmask v17 -S2N_BN_SYMBOL(bignum_copy_row_from_table_16_neon): +S2N_BN_SYMBOL(bignum_copy_row_from_table_16): // Clear accumulator // Zeroing can be done via xor, but xor isn't formalized yet. @@ -71,7 +71,7 @@ S2N_BN_SYMBOL(bignum_copy_row_from_table_16_neon): mov ventry7.16b, ventry0.16b mov cnt, #0 -bignum_copy_row_from_table_16_neon_loop: +bignum_copy_row_from_table_16_loop: // Compute mask: Check if current index matches target index subs xzr, cnt, idx @@ -106,9 +106,9 @@ bignum_copy_row_from_table_16_neon_loop: add cnt, cnt, #1 subs xzr, height, cnt - b.ne bignum_copy_row_from_table_16_neon_loop + b.ne bignum_copy_row_from_table_16_loop -bignum_copy_row_from_table_16_neon_end: +bignum_copy_row_from_table_16_end: str qentry0, [z, #16*0] str qentry1, [z, #16*1] diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_32.S similarity index 94% rename from third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_32.S index c3dc386990f..02a1fdb9151 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_32.S @@ -8,7 +8,7 @@ // achieved by reading the whole table and using the bit-masking to get the // `idx`-th row. // -// extern void bignum_copy_from_table_32_neon +// extern void bignum_copy_from_table_32 // (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx); // // Initial version written by Hanno Becker @@ -16,8 +16,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_32_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_32_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_32) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_32) .text .balign 4 @@ -72,7 +72,7 @@ #define vmask v17 -S2N_BN_SYMBOL(bignum_copy_row_from_table_32_neon): +S2N_BN_SYMBOL(bignum_copy_row_from_table_32): // Clear accumulator // Zeroing can be done via xor, but xor isn't formalized yet. 
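The bit-masking scan that these `bignum_copy_row_from_table*` variants implement can be summarized by a short C reference, shown here for the width-parameterized form; the name is illustrative, and a C compiler is not obliged to keep the mask computation branch-free, which is one reason the library keeps this in assembly:

```c
#include <stdint.h>

/* Reference for the table-scan idiom: touch every row of the table and
 * accumulate (row & mask), where mask is all-ones only for the wanted
 * index, so the memory access pattern is independent of idx.
 * The _16 and _32 variants are this with width fixed at 16 or 32. */
static void copy_row_from_table_ref(uint64_t *z, const uint64_t *table,
                                    uint64_t height, uint64_t width,
                                    uint64_t idx) {
  for (uint64_t j = 0; j < width; j++)
    z[j] = 0;
  for (uint64_t i = 0; i < height; i++) {
    uint64_t mask = (uint64_t)0 - (uint64_t)(i == idx); /* all-ones iff i==idx */
    for (uint64_t j = 0; j < width; j++)
      z[j] |= table[i * width + j] & mask;
  }
}
```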
@@ -94,7 +94,7 @@ S2N_BN_SYMBOL(bignum_copy_row_from_table_32_neon): mov ventry15.16b, ventry0.16b mov cnt, #0 -bignum_copy_row_from_table_32_neon_loop: +bignum_copy_row_from_table_32_loop: // Compute mask: Check if current index matches target index subs xzr, cnt, idx @@ -153,9 +153,9 @@ bignum_copy_row_from_table_32_neon_loop: add cnt, cnt, #1 subs xzr, height, cnt - b.ne bignum_copy_row_from_table_32_neon_loop + b.ne bignum_copy_row_from_table_32_loop -bignum_copy_row_from_table_32_neon_end: +bignum_copy_row_from_table_32_end: str qentry0, [z, #16*0] str qentry1, [z, #16*1] diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_8n.S similarity index 77% rename from third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_8n.S index b065a70525c..8c21b1b8488 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_8n_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_copy_row_from_table_8n.S @@ -8,15 +8,15 @@ // achieved by reading the whole table and using the bit-masking to get the // `idx`-th row. // -// extern void bignum_copy_from_table_8_neon +// extern void bignum_copy_from_table_8n // (uint64_t *z, uint64_t *table, uint64_t height, uint64_t width, uint64_t idx); // // Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = width, X4 = idx // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_8n_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_8n_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy_row_from_table_8n) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy_row_from_table_8n) .text .balign 4 @@ -33,27 +33,27 @@ #define vmask v16 -S2N_BN_SYMBOL(bignum_copy_row_from_table_8n_neon): +S2N_BN_SYMBOL(bignum_copy_row_from_table_8n): - cbz height, bignum_copy_row_from_table_8n_neon_end - cbz width, bignum_copy_row_from_table_8n_neon_end + cbz height, bignum_copy_row_from_table_8n_end + cbz width, bignum_copy_row_from_table_8n_end mov i, width mov x6, z dup v16.2d, xzr -bignum_copy_row_from_table_8n_neon_initzero: +bignum_copy_row_from_table_8n_initzero: str q16, [x6] str q16, [x6, #16] str q16, [x6, #32] str q16, [x6, #48] add x6, x6, #64 subs i, i, #8 - bne bignum_copy_row_from_table_8n_neon_initzero + bne bignum_copy_row_from_table_8n_initzero mov i, xzr mov x8, table -bignum_copy_row_from_table_8n_neon_outerloop: +bignum_copy_row_from_table_8n_outerloop: cmp i, idx csetm mask, eq @@ -62,7 +62,7 @@ bignum_copy_row_from_table_8n_neon_outerloop: mov j, width mov x9, z -bignum_copy_row_from_table_8n_neon_innerloop: +bignum_copy_row_from_table_8n_innerloop: ldr q17, [x8] ldr q18, [x9] @@ -87,14 +87,14 @@ bignum_copy_row_from_table_8n_neon_innerloop: add x8, x8, #64 add x9, x9, #64 subs j, j, #8 - bne bignum_copy_row_from_table_8n_neon_innerloop + bne bignum_copy_row_from_table_8n_innerloop -bignum_copy_row_from_table_8n_neon_innerloop_done: +bignum_copy_row_from_table_8n_innerloop_done: add i, i, #1 cmp i, height - bne bignum_copy_row_from_table_8n_neon_outerloop + bne bignum_copy_row_from_table_8n_outerloop -bignum_copy_row_from_table_8n_neon_end: +bignum_copy_row_from_table_8n_end: ret #if defined(__linux__) && defined(__ELF__) diff --git 
a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctd.S new file mode 100644 index 00000000000..8a721fc1516 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctd.S @@ -0,0 +1,53 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero digits (64-bit words) +// Input x[k]; output function return +// +// extern uint64_t bignum_ctd (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is k +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ctd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ctd) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define a x3 + + +S2N_BN_SYMBOL(bignum_ctd): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_ctd_end + +// Record in i that the lowest nonzero word is i, where i = k means +// that the bignum was entirely zero + + mov i, k +bignum_ctd_loop: + sub k, k, #1 + ldr a, [x, k, lsl #3] + cmp a, #0 + csel i, k, i, ne + cbnz k, bignum_ctd_loop + +// Now return i + + mov x0, i + +bignum_ctd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctz.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctz.S new file mode 100644 index 00000000000..8977925e0e1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ctz.S @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero bits +// Input x[k]; output function return +// +// extern uint64_t bignum_ctz (uint64_t k, uint64_t *x); +// +// +// In the case of a zero bignum as input the result is 64 * k +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ctz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ctz) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define w x3 +#define a x4 + + +S2N_BN_SYMBOL(bignum_ctz): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_ctz_end + +// Use w = a[i] to store nonzero words in a top-down sweep +// Set the initial default to be as if we had a 1 word directly above + + mov i, k + mov w, #1 + +bignum_ctz_loop: + sub k, k, #1 + ldr a, [x, k, lsl #3] + cmp a, #0 + csel i, k, i, ne + csel w, a, w, ne + cbnz k, bignum_ctz_loop + +// Now w = a[i] is the lowest nonzero word, or in the zero case the +// default of the "extra" 1 = a[k]. We now want 64*i + ctz(w). +// +// ARM doesn't have a direct word ctz instruction, so we emulate it via +// ctz(w) = 64 - clz(~w & (w-1)). 
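In C, the same identity might be sketched as follows, with a portable helper standing in for the AArch64 CLZ instruction (which returns 64 for a zero input); both names here are illustrative:

```c
#include <stdint.h>

/* Portable stand-in for 64-bit clz; returns 64 for zero, matching the
 * AArch64 CLZ behaviour the identity below relies on. */
static unsigned clz64(uint64_t x) {
  unsigned n = 0;
  for (uint64_t bit = (uint64_t)1 << 63; bit != 0 && !(x & bit); bit >>= 1)
    n++;
  return n;
}

/* ctz(w) = 64 - clz(~w & (w - 1)): the expression ~w & (w - 1) isolates
 * exactly the trailing-zero bits of w as a block of low ones. */
static unsigned ctz64(uint64_t w) {
  return 64 - clz64(~w & (w - 1));
}
```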
This is depending, for cases of the form +// ctz(....1), on the behavior clz(0) = 64, which is guaranteed according +// to the ARM manual. + + mvn a, w + sub w, w, #1 + add i, i, #1 + and w, w, a + lsl i, i, #6 + clz a, w + sub x0, i, a + +bignum_ctz_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_demont.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_demont.S new file mode 100644 index 00000000000..d93b5b0eacd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_demont.S @@ -0,0 +1,156 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_demont +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z := (x / 2^{64k}) mod m, hence mapping out of Montgomery domain. +// In other words, this is a k-fold Montgomery reduction with same-size input. +// This can handle almost-Montgomery inputs, i.e. any k-digit bignum. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define m x3 + +// Negated modular inverse +#define w x4 +// Outer loop counter +#define i x5 +// Inner loop counter +#define j x6 +// Home for Montgomery multiplier +#define d x7 + +#define h x8 +#define e x9 +#define l x10 +#define a x11 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use i and j again, which aren't used early on. + +#define one x5 +#define e1 x5 +#define e2 x6 +#define e4 x5 +#define e8 x6 + + +S2N_BN_SYMBOL(bignum_demont): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_demont_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initially just copy the input to the output. It would be a little more +// efficient but somewhat fiddlier to tweak the zeroth iteration below instead. 
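The madd/mul chain above computes a word-level negated modular inverse; a C sketch of the same computation (illustrative name, all arithmetic implicitly mod 2^64) looks like this, where the start value is correct in a handful of low bits and each step squares the error term, doubling the number of correct bits:

```c
#include <stdint.h>

/* Sketch of the word-level negated modular inverse: for odd m, returns w
 * with m * w == -1 (mod 2^64).  Mirrors the lsl/sub/eor/madd sequence. */
static uint64_t word_negmodinv_ref(uint64_t m) {
  uint64_t w = (m - (m << 2)) ^ 2;   /* initial approximation */
  uint64_t e = m * w + 1;            /* error term: m*w = -1 + e (mod 2^64) */
  w += e * w; e *= e;                /* each step: w *= (1 + e), e := e^2 */
  w += e * w; e *= e;
  w += e * w; e *= e;
  w += e * w;                        /* error now vanishes mod 2^64 */
  return w;
}
```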
+ + mov i, xzr +bignum_demont_iloop: + ldr a, [x, i, lsl #3] + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_demont_iloop + +// Outer loop, just doing a standard Montgomery reduction on z + + mov i, xzr +bignum_demont_outerloop: + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub a, k, #1 + cbz a, bignum_demont_montend +bignum_demont_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_demont_montloop +bignum_demont_montend: + adc h, xzr, h + sub l, j, #1 + str h, [z, l, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, k + bcc bignum_demont_outerloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + subs j, xzr, xzr +bignum_demont_cmploop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + sbcs xzr, a, e + add j, j, #1 + sub a, j, k + cbnz a, bignum_demont_cmploop + csetm h, cs + +// Now do a masked subtraction of m for the final reduced result. + + subs j, xzr, xzr +bignum_demont_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, h + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_demont_corrloop + +bignum_demont_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digit.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digit.S new file mode 100644 index 00000000000..2d261fd96f4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digit.S @@ -0,0 +1,59 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select digit x[n] +// Inputs x[k], n; output function return +// +// extern uint64_t bignum_digit (uint64_t k, uint64_t *x, uint64_t n); +// +// n'th digit of a k-digit (digit=64 bits) bignum, in constant-time style. +// Indexing starts at 0, which is the least significant digit (little-endian). +// Returns zero if n >= k, i.e. we read a digit off the end of the bignum. 
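A C reference for this constant-time digit selection might look as follows (illustrative name; a C compiler gives no constant-time guarantee for the comparison, which is why the assembly uses csel):

```c
#include <stdint.h>

/* Scan all k digits and keep x[n] via a mask, so the access pattern does
 * not depend on n.  Returns 0 when n >= k, as specified. */
static uint64_t bignum_digit_ref(uint64_t k, const uint64_t *x, uint64_t n) {
  uint64_t d = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t mask = (uint64_t)0 - (uint64_t)(i == n); /* all-ones iff i == n */
    d |= x[i] & mask;
  }
  return d;
}
```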
+// +// Standard ARM ABI: X0 = k, X1 = x, X2 = n, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_digit) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_digit) + .text + .balign 4 + +#define k x0 +#define x x1 +#define n x2 + +#define d x3 +#define i x4 +#define a x5 + + +S2N_BN_SYMBOL(bignum_digit): + +// For length zero finish immediately (the return value in x0 is 0) + + cbz k, bignum_digit_end + +// Set default of zero, run over all the digits and take note of the n'th one + + mov d, xzr + mov i, xzr +bignum_digit_loop: + ldr a, [x, i, lsl #3] + cmp i, n + csel d, a, d, eq + add i, i, #1 + cmp i, k + bcc bignum_digit_loop + +// Return + + mov x0, d + +bignum_digit_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digitsize.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digitsize.S new file mode 100644 index 00000000000..44d98c4e99b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_digitsize.S @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return size of bignum in digits (64-bit word) +// Input x[k]; output function return +// +// extern uint64_t bignum_digitsize (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 0 +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_digitsize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_digitsize) + .text + .balign 4 + +#define k x0 +#define x x1 +#define i x2 +#define a x3 +#define j x4 + + +S2N_BN_SYMBOL(bignum_digitsize): + +// If the bignum is zero-length, x0 is already the right answer of 0 + + cbz k, bignum_digitsize_end + +// Run over the words j = 0..i-1, and set i := j + 1 when hitting nonzero a[j] + + mov i, xzr + mov j, xzr +bignum_digitsize_loop: + ldr a, [x, j, lsl #3] + add j, j, #1 + cmp a, #0 + csel i, j, i, ne + cmp j, k + bne bignum_digitsize_loop + + mov x0, i +bignum_digitsize_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_divmod10.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_divmod10.S new file mode 100644 index 00000000000..2d3515217e1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_divmod10.S @@ -0,0 +1,79 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide bignum by 10, returning remainder: z' := z div 10, return = z mod 10 +// Inputs z[k]; outputs function return (remainder) and z[k] +// +// extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z); +// +// Standard ARM ABI: X0 = k, X1 = z, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_divmod10) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_divmod10) + .text + .balign 4 + +#define k x0 +#define z x1 + +#define d x2 + +#define h x3 +#define q x3 + +#define l x4 +#define r x4 + +#define w x5 +#define s x6 + +S2N_BN_SYMBOL(bignum_divmod10): + +// If k = 0 then return; the return in x0 is indeed 0 mod 10 = 0 + + cbz k, bignum_divmod10_end + +// Straightforward top-down loop doing 10 * q + r' := 2^64 * r + d + + mov r, xzr + mov w, 0x3333333333333333 + add s, w, 1 + and w, w, 0xfffffff +bignum_divmod10_divloop: + sub k, k, 1 + ldr d, [z, k, lsl #3] + +// First re-split and shift so 2^28 * h + l = (2^64 * r + d) / 2 +// Then (2^64 * r + d) / 10 = [(2^28 - 1) / 5] * h + (h + l) / 5 + + extr h, r, d, 29 + ubfx l, d, 1, 28 + + add l, h, l + + mul h, h, w + umulh l, l, s + add q, h, l + str q, [z, k, lsl #3] + +// Generate the new remainder r = d - 10 * q +// Since r <= 9 we only need the low part computation ignoring carries + + add q, q, q, lsl #2 + sub r, d, q, lsl #1 + + cbnz k, bignum_divmod10_divloop + +// Return the final remainder + + mov x0, r + +bignum_divmod10_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_emontredc.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_emontredc.S new file mode 100644 index 00000000000..ebd6a364549 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_emontredc.S @@ -0,0 +1,109 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended Montgomery reduce, returning results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// +// extern uint64_t bignum_emontredc +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +// +// Assumes that z initially holds a 2k-digit bignum z_0, m is a k-digit odd +// bignum and m * w == -1 (mod 2^64). This function also uses z for the output +// as well as returning a carry c of 0 or 1. This encodes two numbers: in the +// lower half of the z buffer we have q = z[0..k-1], while the upper half +// together with the carry gives r = 2^{64k}*c + z[k..2k-1]. These values +// satisfy z_0 + q * m = 2^{64k} * r, i.e. r gives a raw (unreduced) Montgomery +// reduction while q gives the multiplier that was used. Another way of +// thinking of it is that if z' is the output z with the lower half replaced +// with zeros, then z_0 + q * m = 2^{128k} * c + z'. 
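A reference model of that contract, written in straightforward C rather than the optimized form below, could look like this (assumes a compiler providing unsigned __int128; the name is illustrative):

```c
#include <stdint.h>

/* Leaves q in z[0..k-1] and the unreduced upper half in z[k..2k-1], and
 * returns the top carry c, so that z_0 + q*m = 2^{64k} * (2^{64k}*c + upper). */
static uint64_t bignum_emontredc_ref(uint64_t k, uint64_t *z,
                                     const uint64_t *m, uint64_t w) {
  uint64_t c = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t d = z[i] * w;               /* Montgomery multiplier for window i */
    unsigned __int128 carry = 0;
    for (uint64_t j = 0; j < k; j++) {   /* z += d * m at position i */
      carry += (unsigned __int128)d * m[j] + z[i + j];
      z[i + j] = (uint64_t)carry;        /* low word becomes 0 when j == 0 */
      carry >>= 64;
    }
    carry += (unsigned __int128)z[i + k] + c;
    z[i + k] = (uint64_t)carry;
    c = (uint64_t)(carry >> 64);
    z[i] = d;                            /* record the q digit in the low half */
  }
  return c;
}
```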
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = w, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc) + .text + .balign 4 + +#define k x0 +#define z x1 +#define m x2 +#define w x3 + +// Outer loop counter +#define i x4 +// Inner loop counter +#define j x5 +// Home for Montgomery multiplier +#define d x6 + +// Top carry for current window +#define c x7 + +#define h x8 +#define e x9 +#define l x10 +#define a x11 + + +S2N_BN_SYMBOL(bignum_emontredc): + +// If k = 0 the whole operation is trivial; note we also get a return of c = 0 + + cbz k, bignum_emontredc_end + +// Initialize top carry to zero, and launch into the outer loop + + mov c, xzr + mov i, xzr +bignum_emontredc_outerloop: + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + str d, [z] + adds xzr, e, l // Will be zero but want the carry + mov j, #1 + sub a, k, #1 + cbz a, bignum_emontredc_montend +bignum_emontredc_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + str e, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_emontredc_montloop +bignum_emontredc_montend: + adcs h, h, c + adc c, xzr, xzr + ldr a, [z, k, lsl #3] + adds h, h, a + adc c, c, xzr + str h, [z, k, lsl #3] + +// End of outer loop + + add z, z, #8 // For simple indexing, z pointer moves + add i, i, #1 + cmp i, k + bcc bignum_emontredc_outerloop + +// Return c in X0 + + mov x0, c + +bignum_emontredc_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_eq.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_eq.S new file mode 100644 index 00000000000..91efacc1511 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_eq.S @@ -0,0 +1,82 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignums for equality, x = y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_eq +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard ARM ABI: X0 = m, X1 = x, X2 = n, X3 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_eq) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_eq) + .text + .balign 4 + +#define m x0 +#define x x1 +#define n x2 +#define y x3 +#define a x4 +#define c x5 +// We can re-use n for this, not needed when d appears +#define d x2 + + +S2N_BN_SYMBOL(bignum_eq): + +// Initialize the accumulated OR of differences to zero + + mov c, xzr + +// If m >= n jump into the m > n loop at the final equality test +// This will drop through for m = n + + cmp m, n + bcs bignum_eq_mtest + +// Toploop for the case n > m + +bignum_eq_nloop: + sub n, n, #1 + ldr a, [y, n, lsl #3] + orr c, c, a + cmp m, n + bne bignum_eq_nloop + b bignum_eq_mmain + +// Toploop for the case m > n (or n = m which enters at "mtest") + +bignum_eq_mloop: + sub m, m, #1 + ldr a, [x, m, lsl #3] + orr c, c, a + cmp m, n +bignum_eq_mtest: + bne bignum_eq_mloop + +// Combined main loop for the min(m,n) lower words + +bignum_eq_mmain: + cbz m, bignum_eq_end + +bignum_eq_loop: + sub m, m, #1 + ldr a, [x, m, lsl #3] + ldr d, [y, m, lsl #3] + eor a, a, d + orr c, c, a + cbnz m, bignum_eq_loop + +bignum_eq_end: + cmp c, xzr + cset x0, eq + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_even.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_even.S new file mode 100644 index 00000000000..16700254ebd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_even.S @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for even-ness +// Input x[k]; output function return +// +// extern uint64_t bignum_even (uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_even) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_even) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_even): + +cbz x0, bignum_even_end // if k = 0, that's the return! + ldr x0, [x1] + and x0, x0, #1 + +bignum_even_end: + eor x0, x0, #1 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/generic/bignum_ge.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ge.S similarity index 100% rename from third_party/s2n-bignum/arm/generic/bignum_ge.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_ge.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_gt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_gt.S new file mode 100644 index 00000000000..343d53fb119 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_gt.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x > y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_gt +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard ARM ABI: X0 = m, X1 = x, X2 = n, X3 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_gt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_gt) + .text + .balign 4 + +#define m x0 +#define x x1 +#define n x2 +#define y x3 +#define i x4 +#define a x5 +#define d x6 + + +S2N_BN_SYMBOL(bignum_gt): + +// Zero the main index counter for both branches + + mov i, xzr + +// Speculatively form n := n - m and do case split + + subs n, n, m + bcc bignum_gt_ylonger + +// The case where y is longer or of the same size (n >= m) +// Note that CF=1 initially by the fact that we reach this point + + cbz m, bignum_gt_xtest +bignum_gt_xmainloop: + ldr a, [y, i, lsl #3] + ldr d, [x, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_gt_xmainloop +bignum_gt_xtest: + cbz n, bignum_gt_xskip +bignum_gt_xtoploop: + ldr a, [y, i, lsl #3] + sbcs xzr, a, xzr + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_gt_xtoploop +bignum_gt_xskip: + cset x0, cc + ret + +// The case where x is longer (m > n) +// The first "adds" also makes sure CF=1 initially in this branch + +bignum_gt_ylonger: + adds n, n, m + cbz n, bignum_gt_ytoploop + sub m, m, n +bignum_gt_ymainloop: + ldr a, [y, i, lsl #3] + ldr d, [x, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_gt_ymainloop +bignum_gt_ytoploop: + ldr a, [x, i, lsl #3] + sbcs xzr, xzr, a + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_gt_ytoploop + + cset x0, cc + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_iszero.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_iszero.S new file mode 100644 index 00000000000..c65c026de3b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_iszero.S @@ -0,0 +1,43 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
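The borrow-chain idiom used by bignum_gt above (and by the bignum_lt/bignum_le variants that follow) can be expressed as a simple C reference, shown here with explicit bounds checks in place of the assembly's case split (illustrative name):

```c
#include <stdint.h>

/* Run the full subtraction y - x, zero-extending the shorter operand,
 * purely for its final borrow; x > y exactly when it borrows. */
static uint64_t bignum_gt_ref(uint64_t m, const uint64_t *x,
                              uint64_t n, const uint64_t *y) {
  uint64_t len = (m > n) ? m : n;
  uint64_t borrow = 0;
  for (uint64_t i = 0; i < len; i++) {
    uint64_t a = (i < n) ? y[i] : 0;
    uint64_t b = (i < m) ? x[i] : 0;
    /* borrow out of a - b - borrow_in */
    borrow = (a < b) || (a == b && borrow);
  }
  return borrow;   /* 1 iff x > y */
}
```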
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for zero-ness, x = 0 +// Input x[k]; output function return +// +// extern uint64_t bignum_iszero (uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_iszero) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_iszero) + .text + .balign 4 + +#define k x0 +#define x x1 +#define a x2 +#define c x3 + + +S2N_BN_SYMBOL(bignum_iszero): + +mov c, xzr // c will be or of the digits + cbz k, bignum_iszero_end // if k = 0 skip the bignum_iszero_loop + +bignum_iszero_loop: + sub k, k, #1 + ldr a, [x, k, lsl #3] + orr c, c, a + cbnz k, bignum_iszero_loop + +bignum_iszero_end: + cmp c, xzr + cset x0, eq + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_le.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_le.S new file mode 100644 index 00000000000..243f81c99f9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_le.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x <= y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_le +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard ARM ABI: X0 = m, X1 = x, X2 = n, X3 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_le) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_le) + .text + .balign 4 + +#define m x0 +#define x x1 +#define n x2 +#define y x3 +#define i x4 +#define a x5 +#define d x6 + + +S2N_BN_SYMBOL(bignum_le): + +// Zero the main index counter for both branches + + mov i, xzr + +// Speculatively form n := n - m and do case split + + subs n, n, m + bcc bignum_le_ylonger + +// The case where y is longer or of the same size (n >= m) +// Note that CF=1 initially by the fact that we reach this point + + cbz m, bignum_le_xtest +bignum_le_xmainloop: + ldr a, [y, i, lsl #3] + ldr d, [x, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_le_xmainloop +bignum_le_xtest: + cbz n, bignum_le_xskip +bignum_le_xtoploop: + ldr a, [y, i, lsl #3] + sbcs xzr, a, xzr + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_le_xtoploop +bignum_le_xskip: + cset x0, cs + ret + +// The case where x is longer (m > n) +// The first "adds" also makes sure CF=1 initially in this branch + +bignum_le_ylonger: + adds n, n, m + cbz n, bignum_le_ytoploop + sub m, m, n +bignum_le_ymainloop: + ldr a, [y, i, lsl #3] + ldr d, [x, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_le_ymainloop +bignum_le_ytoploop: + ldr a, [x, i, lsl #3] + sbcs xzr, xzr, a + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_le_ytoploop + + cset x0, cs + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_lt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_lt.S new file mode 100644 index 
00000000000..554bfec2aa6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_lt.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x < y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_lt +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard ARM ABI: X0 = m, X1 = x, X2 = n, X3 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_lt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_lt) + .text + .balign 4 + +#define m x0 +#define x x1 +#define n x2 +#define y x3 +#define i x4 +#define a x5 +#define d x6 + + +S2N_BN_SYMBOL(bignum_lt): + +// Zero the main index counter for both branches + + mov i, xzr + +// Speculatively form m := m - n and do case split + + subs m, m, n + bcc bignum_lt_ylonger + +// The case where x is longer or of the same size (m >= n) +// Note that CF=1 initially by the fact that we reach this point + + cbz n, bignum_lt_xtest +bignum_lt_xmainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_lt_xmainloop +bignum_lt_xtest: + cbz m, bignum_lt_xskip +bignum_lt_xtoploop: + ldr a, [x, i, lsl #3] + sbcs xzr, a, xzr + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_lt_xtoploop +bignum_lt_xskip: + cset x0, cc + ret + +// The case where y is longer (n > m) +// The first "adds" also makes sure CF=1 initially in this branch + +bignum_lt_ylonger: + adds m, m, n + cbz m, bignum_lt_ytoploop + sub n, n, m +bignum_lt_ymainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + sbcs xzr, a, d + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_lt_ymainloop +bignum_lt_ytoploop: + ldr a, [y, i, lsl #3] + sbcs xzr, xzr, a + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_lt_ytoploop + + cset x0, cc + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_madd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_madd.S new file mode 100644 index 00000000000..b956e7c7f6e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_madd.S @@ -0,0 +1,124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply-add, z := z + x * y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_madd +// (uint64_t k, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the "z := x * y + z" operation, while also returning a "next" or +// "carry" word. In the case where m + n <= p (i.e. the pure product would +// fit in the destination) this is the remainder for the exact result. 
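A C reference for that contract, mirroring the column-by-column sliding accumulator of the routine below, might be sketched as follows (assumes unsigned __int128; the name is illustrative):

```c
#include <stdint.h>

/* Process columns 0..k-1 of z + x*y with a three-word sliding accumulator
 * and return the word that would land at position k. */
static uint64_t bignum_madd_ref(uint64_t k, uint64_t *z,
                                uint64_t m, const uint64_t *x,
                                uint64_t n, const uint64_t *y) {
  uint64_t lo = 0, hi = 0;                     /* carry limbs into column r */
  for (uint64_t r = 0; r < k; r++) {
    unsigned __int128 acc = ((unsigned __int128)hi << 64) | lo;
    uint64_t top = 0;
    unsigned __int128 s = acc + z[r];
    top += (s < acc);                          /* carry out of 128 bits */
    acc = s;
    /* Accumulate every product x[i]*y[r-i] that hits column r */
    uint64_t i0 = (r + 1 > n) ? r + 1 - n : 0;
    uint64_t i1 = (r + 1 < m) ? r + 1 : m;
    for (uint64_t i = i0; i < i1; i++) {
      unsigned __int128 p = (unsigned __int128)x[i] * y[r - i];
      s = acc + p;
      top += (s < acc);
      acc = s;
    }
    z[r] = (uint64_t)acc;                      /* result digit r */
    lo = (uint64_t)(acc >> 64);                /* slide the accumulator */
    hi = top;
  }
  return lo;                                   /* the "carry/remainder" word */
}
```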
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = x, X4 = n, X5 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_madd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_madd) + .text + .balign 4 + +#define p x0 +#define z x1 +#define m x2 +#define x x3 +#define n x4 +#define y x5 +#define l x6 +#define h x7 +#define c x8 +#define k x9 +#define i x10 +#define a x11 +#define b x12 +#define d x13 +#define xx x14 +#define yy x15 + + +S2N_BN_SYMBOL(bignum_madd): + +// If p = 0 the result is trivial and nothing needs doing +// Note that fortuitously our "carry/remainder" term is still right! +// As it's a multiply-add, could also do this if either argument is trivial + + cbz p, bignum_madd_end + +// initialize (h,l) = 0, saving c = 0 for inside the loop + + mov l, xzr + mov h, xzr + +// Iterate outer loop from k = 0 ... k = p - 1 producing result digits + + mov k, xzr +bignum_madd_outerloop: + +// Add the existing z[k] and (h,l) to get initial (c,h,l) combination + + ldr c, [z, k, lsl #3] + adds l, l, c + adcs h, h, xzr + adc c, xzr, xzr + +// First let a = MAX 0 (k + 1 - n) and b = MIN (k + 1) m +// We want to accumulate all x[i] * y[k - i] for a <= i < b + + add a, k, #1 + cmp a, m + csel b, a, m, cc + subs a, a, n + csel a, a, xzr, cs + +// Set loop count i = b - a, and skip everything if it's <= 0 + + subs i, b, a + bls bignum_madd_innerend + +// Use temporary pointers xx = x + 8 * a and yy = y + 8 * (k - b) +// Increment xx per iteration but just use loop counter with yy +// So we start with [xx] = x[a] and [yy] = y[(k - b) + (b - a)] = y[k - a] + + lsl xx, a, #3 + add xx, xx, x + + sub yy, k, b + lsl yy, yy, #3 + add yy, yy, y + +// And index using the loop counter i = b - a, ..., i = 1 + +bignum_madd_innerloop: + ldr a, [xx], #8 + ldr b, [yy, i, lsl #3] + mul d, a, b + umulh a, a, b + adds l, l, d + adcs h, h, a + adc c, c, xzr + subs i, i, #1 + bne bignum_madd_innerloop + +bignum_madd_innerend: + str l, [z, k, lsl #3] + mov l, h + mov h, c + + add k, k, #1 + cmp k, p + bcc bignum_madd_outerloop // Inverted carry flag! + +// Return the "carry/remainder" term + + mov x0, l + +bignum_madd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modadd.S new file mode 100644 index 00000000000..d23d1e101ed --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modadd.S @@ -0,0 +1,83 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo m, z := (x + y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_modadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = y, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define y x3 +#define m x4 +#define i x5 +#define j x6 +#define a x7 +#define b x8 +#define c x9 + + +S2N_BN_SYMBOL(bignum_modadd): + +adds j, k, xzr // j = k and ZF = (k = 0) + beq bignum_modadd_end // if k = 0 do nothing + adds i, xzr, xzr // i = 0 and CF = 0 + +// First just add (c::z) := x + y + +bignum_modadd_addloop: + ldr a, [x, i] + ldr b, [y, i] + adcs a, a, b + str a, [z, i] + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modadd_addloop + cset c, cs + +// Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m + + mov j, k + subs i, xzr, xzr +bignum_modadd_cmploop: + ldr a, [z, i] + ldr b, [m, i] + sbcs xzr, a, b + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modadd_cmploop + sbcs c, c, xzr + mvn c, c + +// Now do a masked subtraction z := z - [c] * m + + mov j, k + subs i, xzr, xzr +bignum_modadd_subloop: + ldr a, [z, i] + ldr b, [m, i] + and b, b, c + sbcs a, a, b + str a, [z, i] + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modadd_subloop + +bignum_modadd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_moddouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_moddouble.S new file mode 100644 index 00000000000..286e5ea43eb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_moddouble.S @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
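The add/compare/correct pattern of bignum_modadd above has a compact C reference, assuming reduced inputs and a compiler with unsigned __int128 (illustrative name):

```c
#include <stdint.h>

/* z = x + y, then subtract m exactly when the (k+1)-digit sum is >= m,
 * using a mask rather than a data-dependent branch.  Assumes x, y < m. */
static void bignum_modadd_ref(uint64_t k, uint64_t *z, const uint64_t *x,
                              const uint64_t *y, const uint64_t *m) {
  unsigned __int128 acc = 0;
  for (uint64_t i = 0; i < k; i++) {           /* (c::z) := x + y */
    acc += (unsigned __int128)x[i] + y[i];
    z[i] = (uint64_t)acc;
    acc >>= 64;
  }
  uint64_t c = (uint64_t)acc;                  /* top carry bit */

  uint64_t borrow = 0;                         /* does z - m borrow? */
  for (uint64_t i = 0; i < k; i++) {
    unsigned __int128 d = (unsigned __int128)z[i] - m[i] - borrow;
    borrow = (uint64_t)(d >> 64) & 1;
  }

  uint64_t mask = (uint64_t)0 - (c | (1 - borrow)); /* all-ones iff (c::z) >= m */
  borrow = 0;
  for (uint64_t i = 0; i < k; i++) {           /* masked subtraction of m */
    unsigned __int128 d = (unsigned __int128)z[i] - (m[i] & mask) - borrow;
    z[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;
  }
}
```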
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo m, z := (2 * x) mod m, assuming x reduced +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_moddouble +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_moddouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_moddouble) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define m x3 +#define i x4 +#define j x5 +#define a x6 +#define b x7 +#define c x8 + + +S2N_BN_SYMBOL(bignum_moddouble): + +adds j, k, xzr // j = k and ZF = (k = 0) + beq bignum_moddouble_end // if k = 0 do nothing + +// Do (_::z) = 2 * x - m and generate a mask in c for 2 * x < m + + mov c, xzr + subs i, xzr, xzr // i = 0 and CF = 1 +bignum_moddouble_dubloop: + ldr a, [x, i] + extr c, a, c, #63 + ldr b, [m, i] + sbcs c, c, b + str c, [z, i] + mov c, a + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_moddouble_dubloop + lsr c, c, #63 + sbc c, c, xzr + +// Now do a corrective masked addition z := z + [c] * m + + mov j, k + adds i, xzr, xzr +bignum_moddouble_corrloop: + ldr a, [z, i] + ldr b, [m, i] + and b, b, c + adcs a, a, b + str a, [z, i] + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_moddouble_corrloop + +bignum_moddouble_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modexp.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modexp.S new file mode 100644 index 00000000000..761b6e64f30 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modexp.S @@ -0,0 +1,540 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular exponentiation for arbitrary odd modulus +// Inputs a[k], p[k], m[k]; output z[k], temporary buffer t[>=3*k] +// +// extern void bignum_modexp +// (uint64_t k,uint64_t *z, uint64_t *a,uint64_t *p,uint64_t *m,uint64_t *t); +// +// Does z := (a^p) mod m where all numbers are k-digit and m is odd +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = a, X3 = p, X4 = m, X5 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modexp) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modexp) + .text + .balign 4 + +// Local variables, all held in extra registers + +#define k x19 +#define res x20 +#define a x21 +#define p x22 +#define m x23 +#define x x24 +#define i x25 +#define y x, k, lsl #3 +#define z x, k, lsl #4 + +S2N_BN_SYMBOL(bignum_modexp): + +// Save some registers including link register + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x30, [sp, #-16]! 
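Similarly, the double/correct pattern of bignum_moddouble above can be sketched in C as follows, assuming x is reduced and unsigned __int128 is available (illustrative name):

```c
#include <stdint.h>

/* Form the (k+1)-digit value 2*x - m; if it went negative, add m back
 * under a mask.  Assumes x < m. */
static void bignum_moddouble_ref(uint64_t k, uint64_t *z,
                                 const uint64_t *x, const uint64_t *m) {
  uint64_t topbit = 0, borrow = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t d = (x[i] << 1) | topbit;         /* next digit of 2*x */
    topbit = x[i] >> 63;
    unsigned __int128 s = (unsigned __int128)d - m[i] - borrow;
    z[i] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;
  }
  /* 2*x - m is negative exactly when the low-k subtraction borrowed and
   * the shifted-out top bit was zero (given x < m). */
  uint64_t mask = (uint64_t)0 - (borrow & (topbit ^ 1));
  unsigned __int128 carry = 0;
  for (uint64_t i = 0; i < k; i++) {           /* corrective masked addition */
    carry += (unsigned __int128)z[i] + (m[i] & mask);
    z[i] = (uint64_t)carry;
    carry >>= 64;
  }
}
```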
+ +// If size is zero (which falsifies the oddness condition) do nothing + + cbz x0, bignum_modexp_end + +// Move parameters into permanent homes + + mov k, x0 + mov res, x1 + mov a, x2 + mov p, x3 + mov m, x4 + mov x, x5 + +// Let x == 2^64k * a (mod m) and initialize z == 2^64k * 1 (mod m) + + mov x0, k + add x1, z + mov x2, m + add x3, y + bl bignum_modexp_local_amontifier + + mov x0, k + mov x1, x + add x2, z + mov x3, a + mov x4, m + bl bignum_modexp_local_amontmul + + mov x0, k + add x1, z + add x2, z + mov x3, m + bl bignum_modexp_local_demont + +// Main loop with z == 2^64k * a^(p >> 2^i) (mod m) + + lsl i, k, #6 + +bignum_modexp_loop: + sub i, i, #1 + + mov x0, k + add x1, y + add x2, z + add x3, z + mov x4, m + bl bignum_modexp_local_amontmul + + mov x0, k + add x1, z + mov x2, x + add x3, y + mov x4, m + bl bignum_modexp_local_amontmul + + lsr x0, i, #6 + ldr x0, [p, x0, lsl #3] + lsr x0, x0, i + and x0, x0, #1 + + mov x1, k + add x2, z + add x3, z + add x4, y + bl bignum_modexp_local_mux + + cbnz i, bignum_modexp_loop + +// Convert back from Montgomery representation and copy the result +// (via a degenerate case of multiplexing) into the output buffer + + mov x0, k + add x1, z + add x2, z + mov x3, m + bl bignum_modexp_local_demont + + mov x0, xzr + mov x1, k + mov x2, res + add x3, z + add x4, z + bl bignum_modexp_local_mux + +// Restore registers and return + +bignum_modexp_end: + + ldp x25, x30, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copy of bignum_amontifier + +bignum_modexp_local_amontifier: + cbz x0, bignum_modexp_amontifend + mov x4, xzr +bignum_modexp_copyinloop: + ldr x9, [x2, x4, lsl #3] + str x9, [x3, x4, lsl #3] + add x4, x4, #0x1 + cmp x4, x0 + b.cc bignum_modexp_copyinloop + subs x4, x0, #0x1 + b.eq bignum_modexp_normalized +bignum_modexp_normloop: + mov x5, xzr + cmp x9, xzr + mov x7, xzr +bignum_modexp_shufloop: + mov x9, x7 + ldr x7, [x3, x5, lsl #3] + csel x9, x9, x7, eq + str x9, [x3, x5, lsl #3] + add x5, x5, #0x1 + sub x11, x5, x0 + cbnz x11, bignum_modexp_shufloop + subs x4, x4, #0x1 + b.ne bignum_modexp_normloop +bignum_modexp_normalized: + clz x9, x9 + mov x10, xzr + mov x4, xzr + tst x9, #0x3f + csetm x8, ne + neg x11, x9 +bignum_modexp_bitloop: + ldr x5, [x3, x4, lsl #3] + lsl x7, x5, x9 + orr x7, x7, x10 + lsr x10, x5, x11 + and x10, x10, x8 + str x7, [x3, x4, lsl #3] + add x4, x4, #0x1 + cmp x4, x0 + b.cc bignum_modexp_bitloop + sub x6, x0, #0x1 + ldr x6, [x3, x6, lsl #3] + mov x11, #0x1 + neg x10, x6 + mov x4, #0x3e +bignum_modexp_estloop: + add x11, x11, x11 + mov x7, x6 + sub x7, x7, x10 + cmp x10, x7 + csetm x7, cs + sub x11, x11, x7 + add x10, x10, x10 + and x7, x7, x6 + sub x10, x10, x7 + subs x4, x4, #0x1 + b.ne bignum_modexp_estloop + cmp x10, x6 + cinc x11, x11, eq + mov x9, xzr + adds x4, xzr, xzr +bignum_modexp_mulloop: + ldr x7, [x3, x4, lsl #3] + mul x8, x11, x7 + adcs x8, x8, x9 + umulh x9, x11, x7 + str x8, [x1, x4, lsl #3] + add x4, x4, #0x1 + sub x7, x4, x0 + cbnz x7, bignum_modexp_mulloop + adc x9, x9, xzr + mov x7, #0x4000000000000000 + subs x9, x9, x7 + csetm x11, cs + negs x4, xzr +bignum_modexp_remloop: + ldr x7, [x3, x4, lsl #3] + ldr x10, [x1, x4, lsl #3] + and x7, x7, x11 + sbcs x7, x7, x10 + str x7, [x1, x4, lsl #3] + add x4, x4, #0x1 + sub x7, x4, x0 + cbnz x7, bignum_modexp_remloop + mov x9, xzr + negs x5, xzr +bignum_modexp_dubloop1: + ldr x7, [x1, x5, lsl #3] + extr x9, x7, x9, #63 + ldr x10, [x3, x5, lsl #3] + sbcs x9, x9, x10 + str x9, [x1, x5, lsl #3] + 
mov x9, x7 + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_dubloop1 + lsr x9, x9, #63 + sbc x9, x9, xzr + adds x5, xzr, xzr +bignum_modexp_corrloop1: + ldr x7, [x1, x5, lsl #3] + ldr x10, [x3, x5, lsl #3] + and x10, x10, x9 + adcs x7, x7, x10 + str x7, [x1, x5, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_corrloop1 + mov x9, xzr + negs x5, xzr +bignum_modexp_dubloop2: + ldr x7, [x1, x5, lsl #3] + extr x9, x7, x9, #63 + ldr x10, [x3, x5, lsl #3] + sbcs x9, x9, x10 + str x9, [x1, x5, lsl #3] + mov x9, x7 + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_dubloop2 + lsr x9, x9, #63 + sbc x9, x9, xzr + adds x5, xzr, xzr +bignum_modexp_corrloop2: + ldr x7, [x1, x5, lsl #3] + ldr x10, [x3, x5, lsl #3] + and x10, x10, x9 + adcs x7, x7, x10 + str x7, [x1, x5, lsl #3] + str x7, [x3, x5, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_corrloop2 + mov x6, xzr + mov x4, x0 +bignum_modexp_modloop: + mov x5, xzr + mov x10, xzr + adds x9, xzr, xzr +bignum_modexp_cmaloop: + ldr x7, [x1, x5, lsl #3] + mul x8, x6, x7 + adcs x10, x10, x9 + umulh x9, x6, x7 + adc x9, x9, xzr + adds x8, x10, x8 + ldr x10, [x3, x5, lsl #3] + str x8, [x3, x5, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_cmaloop + adcs x6, x10, x9 + csetm x8, cs + adds x5, xzr, xzr +bignum_modexp_oaloop: + ldr x7, [x3, x5, lsl #3] + ldr x10, [x1, x5, lsl #3] + and x10, x10, x8 + adcs x7, x7, x10 + str x7, [x3, x5, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_oaloop + adc x6, x6, xzr + subs x4, x4, #0x1 + b.ne bignum_modexp_modloop + ldr x7, [x2] + lsl x11, x7, #2 + sub x11, x7, x11 + eor x11, x11, #0x2 + mov x8, #0x1 + madd x9, x7, x11, x8 + mul x10, x9, x9 + madd x11, x9, x11, x11 + mul x9, x10, x10 + madd x11, x10, x11, x11 + mul x10, x9, x9 + madd x11, x9, x11, x11 + madd x11, x10, x11, x11 + ldr x10, [x3] + mul x11, x10, x11 + mul x8, x11, x7 + umulh x9, x11, x7 + mov x5, #0x1 + sub x7, x0, #0x1 + cmn x10, x8 + cbz x7, bignum_modexp_montifend +bignum_modexp_montifloop: + ldr x7, [x2, x5, lsl #3] + ldr x10, [x3, x5, lsl #3] + mul x8, x11, x7 + adcs x10, x10, x9 + umulh x9, x11, x7 + adc x9, x9, xzr + adds x10, x10, x8 + sub x7, x5, #0x1 + str x10, [x3, x7, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_montifloop +bignum_modexp_montifend: + adcs x6, x6, x9 + csetm x8, cs + sub x7, x0, #0x1 + str x6, [x3, x7, lsl #3] + negs x5, xzr +bignum_modexp_osloop: + ldr x7, [x3, x5, lsl #3] + ldr x10, [x2, x5, lsl #3] + and x10, x10, x8 + sbcs x7, x7, x10 + str x7, [x1, x5, lsl #3] + add x5, x5, #0x1 + sub x7, x5, x0 + cbnz x7, bignum_modexp_osloop +bignum_modexp_amontifend: + ret + +// Local copy of bignum_amontmul + +bignum_modexp_local_amontmul: + cbz x0, bignum_modexp_amomend + ldr x14, [x4] + lsl x5, x14, #2 + sub x5, x14, x5 + eor x5, x5, #0x2 + mov x6, #0x1 + madd x6, x14, x5, x6 + mul x7, x6, x6 + madd x5, x6, x5, x5 + mul x6, x7, x7 + madd x5, x7, x5, x5 + mul x7, x6, x6 + madd x5, x6, x5, x5 + madd x5, x7, x5, x5 + mov x8, xzr +bignum_modexp_zoop: + str xzr, [x1, x8, lsl #3] + add x8, x8, #0x1 + cmp x8, x0 + b.cc bignum_modexp_zoop + mov x6, xzr + mov x8, xzr +bignum_modexp_outerloop: + ldr x9, [x2, x8, lsl #3] + mov x10, xzr + adds x11, xzr, xzr +bignum_modexp_maddloop: + ldr x14, [x3, x10, lsl #3] + ldr x12, [x1, x10, lsl #3] + mul x13, x9, x14 + adcs x12, x12, x11 + umulh x11, x9, x14 + adc x11, x11, xzr + adds x12, x12, x13 + str x12, [x1, x10, lsl #3] + add x10, x10, #0x1 + sub x14, x10, x0 + cbnz x14, 
bignum_modexp_maddloop + adcs x6, x6, x11 + adc x7, xzr, xzr + ldr x12, [x1] + mul x9, x12, x5 + ldr x14, [x4] + mul x13, x9, x14 + umulh x11, x9, x14 + adds x12, x12, x13 + mov x10, #0x1 + sub x14, x0, #0x1 + cbz x14, bignum_modexp_montend +bignum_modexp_montloop: + ldr x14, [x4, x10, lsl #3] + ldr x12, [x1, x10, lsl #3] + mul x13, x9, x14 + adcs x12, x12, x11 + umulh x11, x9, x14 + adc x11, x11, xzr + adds x12, x12, x13 + sub x13, x10, #0x1 + str x12, [x1, x13, lsl #3] + add x10, x10, #0x1 + sub x14, x10, x0 + cbnz x14, bignum_modexp_montloop +bignum_modexp_montend: + adcs x11, x6, x11 + adc x6, x7, xzr + sub x13, x10, #0x1 + str x11, [x1, x13, lsl #3] + add x8, x8, #0x1 + cmp x8, x0 + b.cc bignum_modexp_outerloop + neg x6, x6 + negs x10, xzr +bignum_modexp_corrloop3: + ldr x14, [x1, x10, lsl #3] + ldr x12, [x4, x10, lsl #3] + and x12, x12, x6 + sbcs x14, x14, x12 + str x14, [x1, x10, lsl #3] + add x10, x10, #0x1 + sub x14, x10, x0 + cbnz x14, bignum_modexp_corrloop3 +bignum_modexp_amomend: + ret + +// Local copy of bignum_demont + +bignum_modexp_local_demont: + cbz x0, bignum_modexp_demontend + ldr x11, [x3] + lsl x4, x11, #2 + sub x4, x11, x4 + eor x4, x4, #0x2 + mov x5, #0x1 + madd x5, x11, x4, x5 + mul x6, x5, x5 + madd x4, x5, x4, x4 + mul x5, x6, x6 + madd x4, x6, x4, x4 + mul x6, x5, x5 + madd x4, x5, x4, x4 + madd x4, x6, x4, x4 + mov x5, xzr +bignum_modexp_iloop: + ldr x11, [x2, x5, lsl #3] + str x11, [x1, x5, lsl #3] + add x5, x5, #0x1 + cmp x5, x0 + b.cc bignum_modexp_iloop + mov x5, xzr +bignum_modexp_douterloop: + ldr x9, [x1] + mul x7, x9, x4 + ldr x11, [x3] + mul x10, x7, x11 + umulh x8, x7, x11 + adds x9, x9, x10 + mov x6, #0x1 + sub x11, x0, #0x1 + cbz x11, bignum_modexp_dmontend +bignum_modexp_dmontloop: + ldr x11, [x3, x6, lsl #3] + ldr x9, [x1, x6, lsl #3] + mul x10, x7, x11 + adcs x9, x9, x8 + umulh x8, x7, x11 + adc x8, x8, xzr + adds x9, x9, x10 + sub x10, x6, #0x1 + str x9, [x1, x10, lsl #3] + add x6, x6, #0x1 + sub x11, x6, x0 + cbnz x11, bignum_modexp_dmontloop +bignum_modexp_dmontend: + adc x8, xzr, x8 + sub x10, x6, #0x1 + str x8, [x1, x10, lsl #3] + add x5, x5, #0x1 + cmp x5, x0 + b.cc bignum_modexp_douterloop + negs x6, xzr +bignum_modexp_cmploop: + ldr x11, [x1, x6, lsl #3] + ldr x9, [x3, x6, lsl #3] + sbcs xzr, x11, x9 + add x6, x6, #0x1 + sub x11, x6, x0 + cbnz x11, bignum_modexp_cmploop + csetm x8, cs + negs x6, xzr +bignum_modexp_corrloop: + ldr x11, [x1, x6, lsl #3] + ldr x9, [x3, x6, lsl #3] + and x9, x9, x8 + sbcs x11, x11, x9 + str x11, [x1, x6, lsl #3] + add x6, x6, #0x1 + sub x11, x6, x0 + cbnz x11, bignum_modexp_corrloop +bignum_modexp_demontend: + ret + +// Local copy of bignum_mux + +bignum_modexp_local_mux: + cbz x1, bignum_modexp_muxend + cmp x0, #0x0 +bignum_modexp_muxloop: + sub x1, x1, #0x1 + ldr x5, [x3, x1, lsl #3] + ldr x0, [x4, x1, lsl #3] + csel x5, x5, x0, ne + str x5, [x2, x1, lsl #3] + cbnz x1, bignum_modexp_muxloop +bignum_modexp_muxend: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modifier.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modifier.S new file mode 100644 index 00000000000..312293274f8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modifier.S @@ -0,0 +1,458 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "modification" constant z := 2^{64k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_modifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// The last argument points to a temporary buffer t that should have size >= k. +// This is called "mod-ifier" because given any other k-digit number x we can +// get x MOD m simply and reasonably efficiently just by Montgomery +// multiplication of x and z. But one can also consider it the identity for +// Montgomery multiplication, assuming you have a reduced multiplier already. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modifier) + .text + .balign 4 + +#define k x0 +#define z x1 +#define m x2 +#define t x3 + +// Some variables +// Modular inverse w is aliased to i, but we never use them together + +#define i x4 +#define w x4 +#define j x5 +#define h x6 +#define a x7 +#define l x8 +#define c x9 +#define b x10 +#define d x11 + +// Some aliases for the values b and d + +#define r x10 +#define q x11 + + +S2N_BN_SYMBOL(bignum_modifier): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_modifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. + + mov i, xzr +bignum_modifier_copyinloop: + ldr c, [m, i, lsl #3] + str c, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_modifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "cmp c, xzr" sets the zeroness predicate (ZF) for the entire inner loop + + subs i, k, #1 + beq bignum_modifier_normalized +bignum_modifier_normloop: + mov j, xzr + cmp c, xzr + mov a, xzr +bignum_modifier_shufloop: + mov c, a + ldr a, [t, j, lsl #3] + csel c, c, a, eq + str c, [t, j, lsl #3] + add j, j, #1 + sub d, j, k + cbnz d, bignum_modifier_shufloop + subs i, i, #1 + bne bignum_modifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. + +bignum_modifier_normalized: + clz c, c + + mov b, xzr + mov i, xzr + ands xzr, c, #63 + csetm l, ne + neg d, c +bignum_modifier_bitloop: + ldr j, [t, i, lsl #3] + lsl a, j, c + orr a, a, b + lsr b, j, d + and b, b, l + str a, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_modifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. 
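For reference, here is a small C model of what this estimation loop computes (illustration only, not part of the patch; the helper name is invented). Starting from q = 2^64 div h and r = 2^64 mod h, which are exact because h >= 2^63, each pass doubles the power of two, so 62 passes give q = 2^126 div h and r = 2^126 mod h without any branch or division instruction. The r = h = 2^63 fix-up discussed just below is omitted from the sketch.

#include <stdint.h>

// Maintains q = 2^(64+i) div h and r = 2^(64+i) mod h for i = 0..62,
// assuming 2^63 <= h < 2^64 (guaranteed by the normalization above).
static void estimate_recip_model(uint64_t h, uint64_t *q_out, uint64_t *r_out) {
  uint64_t q = 1;                  // 2^64 div h
  uint64_t r = (uint64_t)0 - h;    // 2^64 mod h
  for (int i = 0; i < 62; i++) {
    uint64_t ge = (r >= h - r) ? ~(uint64_t)0 : 0;  // mask for 2*r >= h
    q += q;
    q -= ge;                       // q = 2*q + 1 when 2*r >= h
    r += r;
    r -= ge & h;                   // r = 2*r - h in that case
  }
  *q_out = q;
  *r_out = r;
}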
+ + sub h, k, #1 + ldr h, [t, h, lsl #3] + mov q, #1 + neg r, h + mov i, #62 +bignum_modifier_estloop: + add q, q, q + mov a, h + sub a, a, r + cmp r, a // CF <=> r >= h - r <=> 2 * r >= h + csetm a, cs + sub q, q, a + add r, r, r + and a, a, h + sub r, r, a + subs i, i, #1 + bne bignum_modifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + cmp r, h + csinc q, q, q, ne + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + mov c, xzr + adds i, xzr, xzr +bignum_modifier_mulloop: + ldr a, [t, i, lsl #3] + mul l, q, a + adcs l, l, c + umulh c, q, a + str l, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_modifier_mulloop + + adc c, c, xzr + mov a, #0x4000000000000000 + subs c, c, a + csetm q, cs + +// Now do [c] * n - d for our final answer + + subs i, xzr, xzr +bignum_modifier_remloop: + ldr a, [t, i, lsl #3] + ldr b, [z, i, lsl #3] + and a, a, q + sbcs a, a, b + str a, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_modifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from the initial 2^{p+62} == r (mod n). 
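A plain C sketch of one of these modular doublings on k digits (illustration only, not part of the patch; the helper name is invented). It tracks the borrow explicitly instead of using the flag-register idiom of the dubloop/corrloop pairs below: form 2*z - n digit by digit, then add n back under a mask if the result went negative.

#include <stdint.h>

// z := (2 * z) mod n over k 64-bit digits, assuming z is already reduced
// enough that a single conditional add-back suffices (as argued above).
static void moddouble_model(uint64_t k, uint64_t *z, const uint64_t *n) {
  uint64_t topbit = 0, borrow = 0;
  for (uint64_t j = 0; j < k; j++) {
    uint64_t dbl = (z[j] << 1) | topbit;             // doubled digit
    topbit = z[j] >> 63;                             // bit carried upward
    unsigned __int128 d = (unsigned __int128)dbl - n[j] - borrow;
    z[j] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;                // 1 if this digit borrowed
  }
  // 2*z - n is negative exactly when the final borrow exceeds the bit
  // shifted out of the top digit; in that case add n back once.
  uint64_t mask = (uint64_t)0 - (uint64_t)(borrow > topbit);
  uint64_t carry = 0;
  for (uint64_t j = 0; j < k; j++) {
    unsigned __int128 s = (unsigned __int128)z[j] + (n[j] & mask) + carry;
    z[j] = (uint64_t)s;
    carry = (uint64_t)(s >> 64);
  }
}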
+ + mov c, xzr + subs j, xzr, xzr +bignum_modifier_dubloop1: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_dubloop1 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_modifier_corrloop1: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + mov c, xzr + subs j, xzr, xzr +bignum_modifier_dubloop2: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_dubloop2 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_modifier_corrloop2: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + mov h, xzr + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this k more times so we end up == 2^{128*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. + + mov i, k +bignum_modifier_modloop: + mov j, xzr + mov b, xzr + adds c, xzr, xzr +bignum_modifier_cmaloop: + ldr a, [z, j, lsl #3] + mul l, h, a + adcs b, b, c + umulh c, h, a + adc c, c, xzr + adds l, b, l + ldr b, [t, j, lsl #3] + str l, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_cmaloop + + adcs h, b, c + + csetm l, cs + + adds j, xzr, xzr +bignum_modifier_oaloop: + ldr a, [t, j, lsl #3] + ldr b, [z, j, lsl #3] + and b, b, l + adcs a, a, b + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_oaloop + adc h, h, xzr + + subs i, i, #1 + bne bignum_modifier_modloop + +// Compute the negated modular inverse w (same register as i, not used again). + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov l, #1 + madd c, a, w, l + mul b, c, c + madd w, c, w, w + mul c, b, b + madd w, b, w, w + mul b, c, c + madd w, c, w, w + madd w, b, w, w + +// Now do one almost-Montgomery reduction w.r.t. the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits for +// the end result. 
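As an aside, the short madd chain that computed w above is the standard Newton/Hensel iteration for a word-level negated modular inverse; a C model (illustration only, not part of the patch; the helper name is invented):

#include <stdint.h>

// For odd m0, returns w with m0 * w == -1 (mod 2^64). The initial guess is
// correct in roughly the low 5 bits and each step squares the error term,
// so four corrections are enough for 64 bits.
static uint64_t word_negmodinv_model(uint64_t m0) {
  uint64_t w = (m0 - (m0 << 2)) ^ 2;   // ~5 good bits
  uint64_t e = m0 * w + 1;             // error term, == 0 (mod 2^5)
  w = w * (e + 1); e = e * e;          // ~10 bits
  w = w * (e + 1); e = e * e;          // ~20 bits
  w = w * (e + 1); e = e * e;          // ~40 bits
  w = w * (e + 1);                     // ~80 bits, i.e. exact mod 2^64
  return w;
}

With d = t[0] * w (mod 2^64), the sum t[0] + d * m[0] is divisible by 2^64, which is what lets the reduction loop below discard the low digit.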
+ + ldr b, [t] + mul d, b, w + + mul l, d, a + umulh c, d, a + mov j, #1 + sub a, k, #1 + adds xzr, b, l + cbz a, bignum_modifier_amontend + +bignum_modifier_amontloop: + ldr a, [m, j, lsl #3] + ldr b, [t, j, lsl #3] + mul l, d, a + adcs b, b, c + umulh c, d, a + adc c, c, xzr + adds b, b, l + sub a, j, #1 + str b, [t, a, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_amontloop +bignum_modifier_amontend: + adcs h, h, c + csetm l, cs + sub a, k, #1 + str h, [t, a, lsl #3] + + subs j, xzr, xzr +bignum_modifier_osloop: + ldr a, [t, j, lsl #3] + ldr b, [m, j, lsl #3] + and b, b, l + sbcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_osloop + +// So far, the code (basically the same as bignum_amontifier) has produced +// a k-digit value z == 2^{128k} (mod m), not necessarily fully reduced mod m. +// We now do a short Montgomery reduction (similar to bignum_demont) so that +// we achieve full reduction mod m while lopping 2^{64k} off the congruence. +// We recycle h as the somewhat strangely-named outer loop counter. + + mov h, k + +bignum_modifier_montouterloop: + ldr b, [z] + mul d, b, w + ldr a, [m] + mul l, d, a + umulh c, d, a + mov j, #1 + sub a, k, #1 + adds xzr, b, l + cbz a, bignum_modifier_montend +bignum_modifier_montloop: + ldr a, [m, j, lsl #3] + ldr b, [z, j, lsl #3] + mul l, d, a + adcs b, b, c + umulh c, d, a + adc c, c, xzr + adds b, b, l + sub a, j, #1 + str b, [z, a, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_montloop +bignum_modifier_montend: + adc c, c, xzr + sub a, k, #1 + str c, [z, a, lsl #3] + + subs h, h, #1 + bne bignum_modifier_montouterloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + subs j, xzr, xzr +bignum_modifier_cmploop: + ldr a, [z, j, lsl #3] + ldr b, [m, j, lsl #3] + sbcs xzr, a, b + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_cmploop + csetm h, cs + +// Now do a masked subtraction of m for the final reduced result. + + subs j, xzr, xzr +bignum_modifier_corrloop: + ldr a, [z, j, lsl #3] + ldr b, [m, j, lsl #3] + and b, b, h + sbcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_modifier_corrloop + +bignum_modifier_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modinv.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modinv.S new file mode 100644 index 00000000000..b34ab65b382 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modinv.S @@ -0,0 +1,608 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, coprime a +// Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k] +// +// extern void bignum_modinv +// (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t); +// +// k-digit (digit=64 bits) "z := a^-1 mod b" (modular inverse of a modulo b) +// using t as a temporary buffer (t at least 3*k words = 24*k bytes), and +// assuming that a and b are coprime *and* that b is an odd number > 1. 
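A hypothetical caller sketch (illustration only; the digit count, values and buffer names are made up) showing how this prototype is used, in particular that the temporary buffer must provide at least 3*k words:

#include <stdint.h>

extern void bignum_modinv(uint64_t k, uint64_t *z, uint64_t *a,
                          uint64_t *b, uint64_t *t);

void modinv_example(void) {
  uint64_t a[4] = {2, 0, 0, 0};          // b is odd, so a = 2 is coprime to it
  uint64_t b[4] = {0xFFFFFFFFFFFFFFC5ULL, 0xFFFFFFFFFFFFFFFFULL,
                   0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
  uint64_t z[4];
  uint64_t t[3 * 4];                     // temporary buffer: at least 3*k words
  bignum_modinv(4, z, a, b, t);          // z := a^-1 mod b
}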
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = a, X3 = b, X4 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modinv) + .text + .balign 4 + +// We get CHUNKSIZE bits per outer iteration, 64 minus a few for proxy errors + +#define CHUNKSIZE 58 + +// Pervasive variables + +#define k x0 +#define z x1 +#define b x3 +#define w x4 + +// This one is recycled after initial copying in of a as outer loop counter + +#define a x2 +#define t x2 + +// Additional variables; later ones are currently rather high regs + +#define l x5 + +#define m x21 +#define n x22 + +// The matrix of update factors to apply to m and n +// Also used a couple of additional temporary variables for the swapping loop +// Also used as an extra down-counter in corrective negation loops + +#define m_m x6 +#define m_n x7 +#define n_m x8 +#define n_n x9 + +#define j x6 + +// General temporary variables and loop counters + +#define i x10 +#define t1 x11 +#define t2 x12 + +// High and low proxies for the inner loop +// Then re-used for high and carry words during actual cross-multiplications + +#define m_hi x13 +#define n_hi x14 +#define m_lo x15 +#define n_lo x16 + +#define h1 x13 +#define h2 x14 +#define l1 x15 +#define l2 x16 + +#define c1 x17 +#define c2 x19 + +// Negated modular inverse for Montgomery + +#define v x20 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use t1 and t2 again, though carefully since t1 = initial b[0] + +#define one t2 +#define e1 t2 +#define e2 t1 +#define e4 t2 +#define e8 t1 + +S2N_BN_SYMBOL(bignum_modinv): + +// We make use of registers beyond the modifiable + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + +// If k = 0 then do nothing (this is out of scope anyway) + + cbz k, bignum_modinv_end + +// Set up the additional two buffers m and n beyond w in temp space + + lsl i, k, #3 + add m, w, i + add n, m, i + +// Initialize the main buffers with their starting values: +// m = a, n = b, w = b (to be tweaked to b - 1) and z = 0 + + mov i, xzr +bignum_modinv_copyloop: + ldr t1, [a, i, lsl #3] + ldr t2, [b, i, lsl #3] + str t1, [m, i, lsl #3] + str t2, [n, i, lsl #3] + str t2, [w, i, lsl #3] + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_modinv_copyloop + +// Tweak down w to b - 1 (this crude approach is safe as b needs to be odd +// for it to be in scope). We have then established the congruence invariant: +// +// a * w == -m (mod b) +// a * z == n (mod b) +// +// This, with the bound w <= b and z <= b, is maintained round the outer loop + + ldr t1, [w] + sub t2, t1, #1 + str t2, [w] + +// Compute v = negated modular inverse of b mod 2^64, reusing t1 from above +// This is used for Montgomery reduction operations each time round the loop + + lsl v, t1, #2 + sub v, t1, v + eor v, v, #2 + mov one, #1 + madd e1, t1, v, one + mul e2, e1, e1 + madd v, e1, v, v + mul e4, e2, e2 + madd v, e2, v, v + mul e8, e4, e4 + madd v, e4, v, v + madd v, e8, v, v + +// Set up the outer loop count of 128 * k +// The invariant is that m * n < 2^t at all times. + + lsl t, k, #7 + +// Start of the main outer loop iterated t / CHUNKSIZE times + +bignum_modinv_outerloop: + +// We need only bother with sharper l = min k (ceil(t/64)) digits +// for the computations on m and n (but we still need k for w and z). 
+// Either both m and n fit in l digits, or m has become zero and so +// nothing happens in the loop anyway and this makes no difference. + + add i, t, #63 + lsr l, i, #6 + cmp l, k + csel l, k, l, cs + +// Select upper and lower proxies for both m and n to drive the inner +// loop. The lower proxies are simply the lowest digits themselves, +// m_lo = m[0] and n_lo = n[0], while the upper proxies are bitfields +// of the two inputs selected so their top bit (63) aligns with the +// most significant bit of *either* of the two inputs. + + mov h1, xzr // Previous high and low for m + mov l1, xzr + mov h2, xzr // Previous high and low for n + mov l2, xzr + mov c2, xzr // Mask flag: previous word of one was nonzero + // and in this case h1 and h2 are those words + mov i, xzr +bignum_modinv_toploop: + ldr t1, [m, i, lsl #3] + ldr t2, [n, i, lsl #3] + orr c1, t1, t2 + cmp c1, xzr + and c1, c2, h1 + csel l1, c1, l1, ne + and c1, c2, h2 + csel l2, c1, l2, ne + csel h1, t1, h1, ne + csel h2, t2, h2, ne + csetm c2, ne + add i, i, #1 + cmp i, l + bcc bignum_modinv_toploop + + orr t1, h1, h2 + clz t2, t1 + negs c1, t2 + lsl h1, h1, t2 + csel l1, l1, xzr, ne + lsl h2, h2, t2 + csel l2, l2, xzr, ne + lsr l1, l1, c1 + lsr l2, l2, c1 + orr m_hi, h1, l1 + orr n_hi, h2, l2 + + ldr m_lo, [m] + ldr n_lo, [n] + +// Now the inner loop, with i as loop counter from CHUNKSIZE down. +// This records a matrix of updates to apply to the initial +// values of m and n with, at stage j: +// +// sgn * m' = (m_m * m - m_n * n) / 2^j +// -sgn * n' = (n_m * m - n_n * n) / 2^j +// +// where "sgn" is either +1 or -1, and we lose track of which except +// that both instance above are the same. This throwing away the sign +// costs nothing (since we have to correct in general anyway because +// of the proxied comparison) and makes things a bit simpler. But it +// is simply the parity of the number of times the first condition, +// used as the swapping criterion, fires in this loop. + + mov m_m, #1 + mov m_n, xzr + mov n_m, xzr + mov n_n, #1 + + mov i, #CHUNKSIZE + +// Conceptually in the inner loop we follow these steps: +// +// * If m_lo is odd and m_hi < n_hi, then swap the four pairs +// (m_hi,n_hi); (m_lo,n_lo); (m_m,n_m); (m_n,n_n) +// +// * Now, if m_lo is odd (old or new, doesn't matter as initial n_lo is odd) +// m_hi := m_hi - n_hi, m_lo := m_lo - n_lo +// m_m := m_m + n_m, m_n := m_n + n_n +// +// * Halve and double them +// m_hi := m_hi / 2, m_lo := m_lo / 2 +// n_m := n_m * 2, n_n := n_n * 2 +// +// The actual computation computes updates before actually swapping and +// then corrects as needed. It also maintains the invariant ~ZF <=> odd(m_lo), +// since it seems to reduce the dependent latency. Set that up first. + + ands xzr, m_lo, #1 + +bignum_modinv_innerloop: + +// At the start of the loop ~ZF <=> m_lo is odd; mask values accordingly +// Set the flags for m_hi - [~ZF] * n_hi so we know to flip things. + + csel t1, n_hi, xzr, ne + csel t2, n_lo, xzr, ne + csel c1, n_m, xzr, ne + csel c2, n_n, xzr, ne + ccmp m_hi, n_hi, #0x2, ne + +// Compute subtractive updates, trivial in the case ZF <=> even(m_lo). + + sub t1, m_hi, t1 + sub t2, m_lo, t2 + +// If the subtraction borrows, swap things appropriately, negating where +// we've already subtracted so things are as if we actually swapped first. 
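For orientation, a deliberately branchy (and therefore not constant-time) C model of the conceptual steps listed above (illustration only, not part of the patch; the struct and helper names are invented). It runs on the 64-bit proxies and returns the 2x2 update matrix that the surrounding code then applies to the full k-digit numbers:

#include <stdint.h>

typedef struct { uint64_t m_m, m_n, n_m, n_n; } update_t;

// One chunk of the inner loop; up to a common sign the result satisfies
//   m' = (m_m * m - m_n * n) / 2^58,  n' = (n_m * m - n_n * n) / 2^58.
static update_t divstep_chunk_model(uint64_t m_hi, uint64_t m_lo,
                                    uint64_t n_hi, uint64_t n_lo) {
  update_t u = {1, 0, 0, 1};
  for (int i = 0; i < 58; i++) {          // CHUNKSIZE iterations
    if (m_lo & 1) {
      if (m_hi < n_hi) {                  // swap the roles of m and n
        uint64_t t;
        t = m_hi; m_hi = n_hi; n_hi = t;
        t = m_lo; m_lo = n_lo; n_lo = t;
        t = u.m_m; u.m_m = u.n_m; u.n_m = t;
        t = u.m_n; u.m_n = u.n_n; u.n_n = t;
      }
      m_hi -= n_hi;                       // make m even
      m_lo -= n_lo;
      u.m_m += u.n_m;
      u.m_n += u.n_n;
    }
    m_hi >>= 1;                           // halve m ...
    m_lo >>= 1;
    u.n_m += u.n_m;                       // ... and double n's row instead
    u.n_n += u.n_n;
  }
  return u;
}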
+ + csel n_hi, n_hi, m_hi, cs + cneg t1, t1, cc + csel n_lo, n_lo, m_lo, cs + cneg m_lo, t2, cc + csel n_m, n_m, m_m, cs + csel n_n, n_n, m_n, cs + +// Update and shift while setting oddness flag for next iteration +// We look at bit 1 of t2 (m_lo before possible negation), which is +// safe because it is even. + + ands xzr, t2, #2 + add m_m, m_m, c1 + add m_n, m_n, c2 + lsr m_hi, t1, #1 + lsr m_lo, m_lo, #1 + add n_m, n_m, n_m + add n_n, n_n, n_n + +// Next iteration; don't disturb the flags since they are used at entry + + sub i, i, #1 + cbnz i, bignum_modinv_innerloop + +// Apply the update to w and z, using addition in this case, and also take +// the chance to shift an additional 6 = 64-CHUNKSIZE bits to be ready for a +// Montgomery multiplication. Because we know that m_m + m_n <= 2^58 and +// w, z <= b < 2^{64k}, we know that both of these fit in k+1 words. +// We do this before the m-n update to allow us to play with c1 and c2 here. +// +// h1::w = 2^6 * (m_m * w + m_n * z) +// h2::z = 2^6 * (n_m * w + n_n * z) +// +// with c1 and c2 recording previous words for the shifting part + + mov h1, xzr + mov h2, xzr + mov c1, xzr + mov c2, xzr + + mov i, xzr +bignum_modinv_congloop: + ldr t1, [w, i, lsl #3] + ldr t2, [z, i, lsl #3] + + mul l1, m_m, t1 + mul l2, m_n, t2 + adds l1, l1, h1 + umulh h1, m_m, t1 + adc h1, h1, xzr + adds l1, l1, l2 + extr c1, l1, c1, #CHUNKSIZE + str c1, [w, i, lsl #3] + mov c1, l1 + umulh l1, m_n, t2 + adc h1, h1, l1 + + mul l1, n_m, t1 + mul l2, n_n, t2 + adds l1, l1, h2 + umulh h2, n_m, t1 + adc h2, h2, xzr + adds l1, l1, l2 + extr c2, l1, c2, #CHUNKSIZE + str c2, [z, i, lsl #3] + mov c2, l1 + umulh l1, n_n, t2 + adc h2, h2, l1 + + add i, i, #1 + cmp i, k + bcc bignum_modinv_congloop + + extr h1, h1, c1, #CHUNKSIZE + extr h2, h2, c2, #CHUNKSIZE + +// Do a Montgomery reduction of h1::w + + ldr t1, [w] + mul c1, t1, v + ldr t2, [b] + mul l1, c1, t2 + umulh l2, c1, t2 + adds t1, t1, l1 // Will be zero but want the carry + + mov i, #1 + sub t1, k, #1 + cbz t1, bignum_modinv_wmontend +bignum_modinv_wmontloop: + ldr t1, [b, i, lsl #3] + ldr t2, [w, i, lsl #3] + mul l1, c1, t1 + adcs t2, t2, l2 + umulh l2, c1, t1 + adc l2, l2, xzr + adds t2, t2, l1 + sub l1, i, #1 + str t2, [w, l1, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_wmontloop +bignum_modinv_wmontend: + adcs l2, l2, h1 + adc h1, xzr, xzr + sub l1, i, #1 + str l2, [w, l1, lsl #3] + + subs i, xzr, xzr +bignum_modinv_wcmploop: + ldr t1, [w, i, lsl #3] + ldr t2, [b, i, lsl #3] + sbcs xzr, t1, t2 + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_wcmploop + + sbcs xzr, h1, xzr + csetm h1, cs + + subs i, xzr, xzr +bignum_modinv_wcorrloop: + ldr t1, [w, i, lsl #3] + ldr t2, [b, i, lsl #3] + and t2, t2, h1 + sbcs t1, t1, t2 + str t1, [w, i, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_wcorrloop + +// Do a Montgomery reduction of h2::z + + ldr t1, [z] + mul c1, t1, v + ldr t2, [b] + mul l1, c1, t2 + umulh l2, c1, t2 + adds t1, t1, l1 // Will be zero but want the carry + + mov i, #1 + sub t1, k, #1 + cbz t1, bignum_modinv_zmontend +bignum_modinv_zmontloop: + ldr t1, [b, i, lsl #3] + ldr t2, [z, i, lsl #3] + mul l1, c1, t1 + adcs t2, t2, l2 + umulh l2, c1, t1 + adc l2, l2, xzr + adds t2, t2, l1 + sub l1, i, #1 + str t2, [z, l1, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_zmontloop +bignum_modinv_zmontend: + adcs l2, l2, h2 + adc h2, xzr, xzr + sub l1, i, #1 + str l2, [z, l1, lsl #3] + + subs i, xzr, xzr +bignum_modinv_zcmploop: + ldr t1, [z, i, lsl #3] + ldr 
t2, [b, i, lsl #3] + sbcs xzr, t1, t2 + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_zcmploop + + sbcs xzr, h2, xzr + csetm h2, cs + + subs i, xzr, xzr +bignum_modinv_zcorrloop: + ldr t1, [z, i, lsl #3] + ldr t2, [b, i, lsl #3] + and t2, t2, h2 + sbcs t1, t1, t2 + str t1, [z, i, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_zcorrloop + +// Now actually compute the updates to m and n corresponding to the matrix, +// and correct the signs if they have gone negative. First we compute the +// (k+1)-sized updates with the following invariant (here c1 and c2 are in +// fact carry bitmasks, either 0 or -1): +// +// c1::h1::m = m_m * m - m_n * n +// c2::h2::n = n_m * m - n_n * n + + mov h1, xzr + mov h2, xzr + mov c1, xzr + mov c2, xzr + mov i, xzr +bignum_modinv_crossloop: + ldr t1, [m, i, lsl #3] + ldr t2, [n, i, lsl #3] + + mul l1, m_m, t1 + mul l2, m_n, t2 + adds l1, l1, h1 + umulh h1, m_m, t1 + adc h1, h1, xzr + subs l1, l1, l2 + str l1, [m, i, lsl #3] + umulh l1, m_n, t2 + sub c1, l1, c1 + sbcs h1, h1, c1 + csetm c1, cc + + mul l1, n_m, t1 + mul l2, n_n, t2 + adds l1, l1, h2 + umulh h2, n_m, t1 + adc h2, h2, xzr + subs l1, l1, l2 + str l1, [n, i, lsl #3] + umulh l1, n_n, t2 + sub c2, l1, c2 + sbcs h2, h2, c2 + csetm c2, cc + + add i, i, #1 + cmp i, l + bcc bignum_modinv_crossloop + +// Write back m optionally negated and shifted right CHUNKSIZE bits + + adds xzr, c1, c1 + + ldr l1, [m] + mov i, xzr + sub j, l, #1 + cbz j, bignum_modinv_negskip1 + +bignum_modinv_negloop1: + add t1, i, #8 + ldr t2, [m, t1] + extr l1, t2, l1, #CHUNKSIZE + eor l1, l1, c1 + adcs l1, l1, xzr + str l1, [m, i] + mov l1, t2 + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modinv_negloop1 +bignum_modinv_negskip1: + extr l1, h1, l1, #CHUNKSIZE + eor l1, l1, c1 + adcs l1, l1, xzr + str l1, [m, i] + +// Write back n optionally negated and shifted right CHUNKSIZE bits + + adds xzr, c2, c2 + + ldr l1, [n] + mov i, xzr + sub j, l, #1 + cbz j, bignum_modinv_negskip2 +bignum_modinv_negloop2: + add t1, i, #8 + ldr t2, [n, t1] + extr l1, t2, l1, #CHUNKSIZE + eor l1, l1, c2 + adcs l1, l1, xzr + str l1, [n, i] + mov l1, t2 + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modinv_negloop2 +bignum_modinv_negskip2: + extr l1, h2, l1, #CHUNKSIZE + eor l1, l1, c2 + adcs l1, l1, xzr + str l1, [n, i] + +// Finally, use the signs c1 and c2 to do optional modular negations of +// w and z respectively, flipping c2 to make signs work. We don't make +// any checks for zero values, but we certainly retain w <= b and z <= b. +// This is enough for the Montgomery step in the next iteration to give +// strict reduction w < b amd z < b, and anyway when we terminate we +// could not have z = b since it violates the coprimality assumption for +// in-scope cases. + + mov i, xzr + adds xzr, c1, c1 +bignum_modinv_wfliploop: + ldr t1, [b, i, lsl #3] + ldr t2, [w, i, lsl #3] + and t1, t1, c1 + eor t2, t2, c1 + adcs t1, t1, t2 + str t1, [w, i, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_wfliploop + + mvn c2, c2 + + mov i, xzr + adds xzr, c2, c2 +bignum_modinv_zfliploop: + ldr t1, [b, i, lsl #3] + ldr t2, [z, i, lsl #3] + and t1, t1, c2 + eor t2, t2, c2 + adcs t1, t1, t2 + str t1, [z, i, lsl #3] + add i, i, #1 + sub t1, i, k + cbnz t1, bignum_modinv_zfliploop + +// End of main loop. We can stop if t' <= 0 since then m * n < 2^0, which +// since n is odd and m and n are coprime (in the in-scope cases) means +// m = 0, n = 1 and hence from the congruence invariant a * z == 1 (mod b). 
+// Moreover we do in fact need to maintain strictly t > 0 in the main loop, +// or the computation of the optimized digit bound l could collapse to 0. + + subs t, t, #CHUNKSIZE + bhi bignum_modinv_outerloop + +bignum_modinv_end: + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modoptneg.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modoptneg.S new file mode 100644 index 00000000000..2f383f1a29e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modoptneg.S @@ -0,0 +1,78 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x +// (if p zero), assuming x reduced +// Inputs p, x[k], m[k]; output z[k] +// +// extern void bignum_modoptneg +// (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x, uint64_t *m); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = p, X3 = x, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modoptneg) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modoptneg) + .text + .balign 4 + +#define k x0 +#define z x1 +#define p x2 +#define x x3 +#define m x4 +#define i x5 +#define a x6 +#define b x7 + + +S2N_BN_SYMBOL(bignum_modoptneg): + +// Do nothing if k = 0 + + cbz k, bignum_modoptneg_end + +// Make an additional check for zero input, and force p to zero in this case. +// This can be skipped if the input is known not to be zero a priori. + + mov i, xzr + mov a, xzr +bignum_modoptneg_cmploop: + ldr b, [x, i, lsl #3] + orr a, a, b + add i, i, #1 + cmp i, k + bcc bignum_modoptneg_cmploop + + cmp a, xzr + csel p, p, xzr, ne + +// Turn the input p into a strict bitmask + + cmp p, xzr + csetm p, ne + +// Main loop + + mov i, xzr + adds xzr, p, p +bignum_modoptneg_mainloop: + + ldr a, [m, i, lsl #3] + ldr b, [x, i, lsl #3] + and a, a, p + eor b, b, p + adcs a, a, b + str a, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_modoptneg_mainloop + +bignum_modoptneg_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modsub.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modsub.S new file mode 100644 index 00000000000..0af361be4fb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_modsub.S @@ -0,0 +1,69 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_modsub +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = y, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define y x3 +#define m x4 +#define i x5 +#define j x6 +#define a x7 +#define b x8 +#define c x9 + + +S2N_BN_SYMBOL(bignum_modsub): + +adds j, k, xzr // j = k and ZF = (k = 0) + beq bignum_modsub_end // if k = 0 do nothing + subs i, xzr, xzr // i = 0 and CF = 1 + +// Subtract z := x - y and record a mask for the carry x - y < 0 + +bignum_modsub_subloop: + ldr a, [x, i] + ldr b, [y, i] + sbcs a, a, b + str a, [z, i] + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modsub_subloop + csetm c, cc + +// Now do a masked addition z := z + [c] * m + + mov j, k + adds i, xzr, xzr +bignum_modsub_addloop: + ldr a, [z, i] + ldr b, [m, i] + and b, b, c + adcs a, a, b + str a, [z, i] + add i, i, #8 + sub j, j, #1 + cbnz j, bignum_modsub_addloop + +bignum_modsub_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montifier.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montifier.S new file mode 100644 index 00000000000..b82a65013c3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montifier.S @@ -0,0 +1,457 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "montification" constant z := 2^{128k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_montifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// The last argument points to a temporary buffer t that should have size >= k. +// This is called "montifier" because given any other k-digit number x, +// whether or not it's reduced modulo m, it can be mapped to its Montgomery +// representation (2^{64k} * x) mod m just by Montgomery multiplication by z. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = m, X3 = t +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montifier) + .text + .balign 4 + +#define k x0 +#define z x1 +#define m x2 +#define t x3 + +// Some variables +// Modular inverse w is aliased to i, but we never use them together + +#define i x4 +#define w x4 +#define j x5 +#define h x6 +#define a x7 +#define l x8 +#define c x9 +#define b x10 +#define d x11 + +// Some aliases for the values b and d + +#define r x10 +#define q x11 + + +S2N_BN_SYMBOL(bignum_montifier): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_montifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. 
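The copy loop below is followed by the same constant-time digit normalization used in bignum_modifier; as a rough C model of what that normalization does (illustration only, not part of the patch, written with a visible conditional instead of the csel/mask idiom):

#include <stdint.h>

// Repeat (k-1) times: if the current top digit is zero, shift the whole
// number up by one digit. The real code performs identical loads and
// stores whether or not the shift is taken, so it leaks nothing.
static void normalize_digits_model(uint64_t k, uint64_t *t) {
  for (uint64_t i = 0; i + 1 < k; i++) {
    int top_is_zero = (t[k - 1] == 0);
    uint64_t prev = 0;
    for (uint64_t j = 0; j < k; j++) {
      uint64_t cur = t[j];
      t[j] = top_is_zero ? prev : cur;   // conditional one-digit left shift
      prev = cur;
    }
  }
}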
+ + mov i, xzr +bignum_montifier_copyinloop: + ldr c, [m, i, lsl #3] + str c, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_montifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "cmp c, xzr" sets the zeroness predicate (ZF) for the entire inner loop + + subs i, k, #1 + beq bignum_montifier_normalized +bignum_montifier_normloop: + mov j, xzr + cmp c, xzr + mov a, xzr +bignum_montifier_shufloop: + mov c, a + ldr a, [t, j, lsl #3] + csel c, c, a, eq + str c, [t, j, lsl #3] + add j, j, #1 + sub d, j, k + cbnz d, bignum_montifier_shufloop + subs i, i, #1 + bne bignum_montifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. + +bignum_montifier_normalized: + clz c, c + + mov b, xzr + mov i, xzr + ands xzr, c, #63 + csetm l, ne + neg d, c +bignum_montifier_bitloop: + ldr j, [t, i, lsl #3] + lsl a, j, c + orr a, a, b + lsr b, j, d + and b, b, l + str a, [t, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_montifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. + + sub h, k, #1 + ldr h, [t, h, lsl #3] + mov q, #1 + neg r, h + mov i, #62 +bignum_montifier_estloop: + add q, q, q + mov a, h + sub a, a, r + cmp r, a // CF <=> r >= h - r <=> 2 * r >= h + csetm a, cs + sub q, q, a + add r, r, r + and a, a, h + sub r, r, a + subs i, i, #1 + bne bignum_montifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + cmp r, h + csinc q, q, q, ne + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) 
make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + mov c, xzr + adds i, xzr, xzr +bignum_montifier_mulloop: + ldr a, [t, i, lsl #3] + mul l, q, a + adcs l, l, c + umulh c, q, a + str l, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_montifier_mulloop + + adc c, c, xzr + mov a, #0x4000000000000000 + subs c, c, a + csetm q, cs + +// Now do [c] * n - d for our final answer + + subs i, xzr, xzr +bignum_montifier_remloop: + ldr a, [t, i, lsl #3] + ldr b, [z, i, lsl #3] + and a, a, q + sbcs a, a, b + str a, [z, i, lsl #3] + add i, i, #1 + sub a, i, k + cbnz a, bignum_montifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from the initial 2^{p+62} == r (mod n). + + mov c, xzr + subs j, xzr, xzr +bignum_montifier_dubloop1: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_dubloop1 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_montifier_corrloop1: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + mov c, xzr + subs j, xzr, xzr +bignum_montifier_dubloop2: + ldr a, [z, j, lsl #3] + extr c, a, c, #63 + ldr b, [t, j, lsl #3] + sbcs c, c, b + str c, [z, j, lsl #3] + mov c, a + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_dubloop2 + lsr c, c, #63 + sbc c, c, xzr + adds j, xzr, xzr +bignum_montifier_corrloop2: + ldr a, [z, j, lsl #3] + ldr b, [t, j, lsl #3] + and b, b, c + adcs a, a, b + str a, [z, j, lsl #3] + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + mov h, xzr + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this 2*k more times so we end up == 2^{192*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. 
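A C rendering of one iteration of the loop described above, with the implicit one-digit offset made explicit (illustration only, not part of the patch; the helper name is invented). With r held in z, it replaces h::t by r*h + 2^64*t and then adds r once more if that spilled past k+1 digits:

#include <stdint.h>

// One step of t_{i+1} == 2^64 * t_i (mod n): h is the current top digit,
// t[] holds the low k digits and z[] holds r == 2^{p+64} mod n.
static void montifier_modstep_model(uint64_t k, uint64_t *h, uint64_t *t,
                                    const uint64_t *z) {
  uint64_t hh = *h, carry = 0, prev = 0;   // prev = digit shifted in from below
  for (uint64_t j = 0; j < k; j++) {
    unsigned __int128 p = (unsigned __int128)hh * z[j] + prev + carry;
    prev = t[j];                           // old digit feeds the next position
    t[j] = (uint64_t)p;
    carry = (uint64_t)(p >> 64);
  }
  unsigned __int128 top = (unsigned __int128)prev + carry;
  uint64_t mask = (uint64_t)0 - (uint64_t)(top >> 64);  // spilled past k+1 digits?
  uint64_t c = 0;
  for (uint64_t j = 0; j < k; j++) {       // conditionally add r back in
    unsigned __int128 s = (unsigned __int128)t[j] + (z[j] & mask) + c;
    t[j] = (uint64_t)s;
    c = (uint64_t)(s >> 64);
  }
  *h = (uint64_t)top + c;
}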
+ + add i, k, k +bignum_montifier_modloop: + mov j, xzr + mov b, xzr + adds c, xzr, xzr +bignum_montifier_cmaloop: + ldr a, [z, j, lsl #3] + mul l, h, a + adcs b, b, c + umulh c, h, a + adc c, c, xzr + adds l, b, l + ldr b, [t, j, lsl #3] + str l, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_cmaloop + + adcs h, b, c + + csetm l, cs + + adds j, xzr, xzr +bignum_montifier_oaloop: + ldr a, [t, j, lsl #3] + ldr b, [z, j, lsl #3] + and b, b, l + adcs a, a, b + str a, [t, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_oaloop + adc h, h, xzr + + subs i, i, #1 + bne bignum_montifier_modloop + +// Compute the negated modular inverse w (same register as i, not used again). + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov l, #1 + madd c, a, w, l + mul b, c, c + madd w, c, w, w + mul c, b, b + madd w, b, w, w + mul b, c, c + madd w, c, w, w + madd w, b, w, w + +// Now do one almost-Montgomery reduction w.r.t. the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits for +// the end result. + + ldr b, [t] + mul d, b, w + + mul l, d, a + umulh c, d, a + mov j, #1 + sub a, k, #1 + adds xzr, b, l + cbz a, bignum_montifier_amontend + +bignum_montifier_amontloop: + ldr a, [m, j, lsl #3] + ldr b, [t, j, lsl #3] + mul l, d, a + adcs b, b, c + umulh c, d, a + adc c, c, xzr + adds b, b, l + sub a, j, #1 + str b, [t, a, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_amontloop +bignum_montifier_amontend: + adcs h, h, c + csetm l, cs + sub a, k, #1 + str h, [t, a, lsl #3] + + subs j, xzr, xzr +bignum_montifier_osloop: + ldr a, [t, j, lsl #3] + ldr b, [m, j, lsl #3] + and b, b, l + sbcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_osloop + +// So far, the code(basically a variant of bignum_amontifier) has produced +// a k-digit value z == 2^{192k} (mod m), not necessarily fully reduced mod m. +// We now do a short Montgomery reduction (similar to bignum_demont) so that +// we achieve full reduction mod m while lopping 2^{64k} off the congruence. +// We recycle h as the somewhat strangely-named outer loop counter. + + mov h, k + +bignum_montifier_montouterloop: + ldr b, [z] + mul d, b, w + ldr a, [m] + mul l, d, a + umulh c, d, a + mov j, #1 + sub a, k, #1 + adds xzr, b, l + cbz a, bignum_montifier_montend +bignum_montifier_montloop: + ldr a, [m, j, lsl #3] + ldr b, [z, j, lsl #3] + mul l, d, a + adcs b, b, c + umulh c, d, a + adc c, c, xzr + adds b, b, l + sub a, j, #1 + str b, [z, a, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_montloop +bignum_montifier_montend: + adc c, c, xzr + sub a, k, #1 + str c, [z, a, lsl #3] + + subs h, h, #1 + bne bignum_montifier_montouterloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + subs j, xzr, xzr +bignum_montifier_cmploop: + ldr a, [z, j, lsl #3] + ldr b, [m, j, lsl #3] + sbcs xzr, a, b + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_cmploop + csetm h, cs + +// Now do a masked subtraction of m for the final reduced result. 
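This compare-then-masked-subtract tail is the same constant-time final correction used throughout these files; a compact C equivalent (illustration only, not part of the patch; the helper name is invented):

#include <stdint.h>

// If z >= m then z := z - m, else leave z alone, without branching on the data.
static void masked_correct_model(uint64_t k, uint64_t *z, const uint64_t *m) {
  uint64_t borrow = 0;
  for (uint64_t j = 0; j < k; j++) {       // trial subtraction, keep only the borrow
    unsigned __int128 d = (unsigned __int128)z[j] - m[j] - borrow;
    borrow = (uint64_t)(d >> 64) & 1;
  }
  uint64_t mask = borrow - 1;              // all-ones exactly when z >= m
  borrow = 0;
  for (uint64_t j = 0; j < k; j++) {       // masked subtraction
    unsigned __int128 d = (unsigned __int128)z[j] - (m[j] & mask) - borrow;
    z[j] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;
  }
}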
+ + subs j, xzr, xzr +bignum_montifier_corrloop: + ldr a, [z, j, lsl #3] + ldr b, [m, j, lsl #3] + and b, b, h + sbcs a, a, b + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montifier_corrloop + +bignum_montifier_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montmul.S new file mode 100644 index 00000000000..672885189b2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montmul.S @@ -0,0 +1,193 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^{64k}) mod m +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_montmul +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Does z := (x * y / 2^{64k}) mod m, assuming x * y <= 2^{64k} * m, which is +// guaranteed in particular if x < m, y < m initially (the "intended" case). +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = y, X4 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define y x3 +#define m x4 + +// Negated modular inverse +#define w x5 +// Top carry for k'th position +#define c0 x6 +// Additional top carry for (k+1)'th position +#define c1 x7 +// Outer loop counter +#define i x8 +// Home for i'th digit or Montgomery multiplier +#define d x9 +// Inner loop counter +#define j x10 + +#define h x11 +#define e x12 +#define l x13 +#define a x14 + +// This is just a short-term temporary used in zero-test subtraction. +// It's aliased to the same register as "a" which is always safe here. + +#define t x14 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use c0 and c1 again, which aren't initialized early on. + +#define one x6 +#define e1 x6 +#define e2 x7 +#define e4 x6 +#define e8 x7 + + +S2N_BN_SYMBOL(bignum_montmul): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_montmul_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + mov i, xzr +bignum_montmul_zoop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_montmul_zoop + mov c0, xzr + +// Outer loop pulling down digits d=x[i], multiplying by y and reducing + + mov i, xzr +bignum_montmul_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. 
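A C sketch of this row operation with the two top carries made explicit (illustration only, not part of the patch; the helper name is invented). The 128-bit accumulator stands in for the CF-plus-high-part carry chain of the assembly:

#include <stdint.h>

// (c1::c0::z) += d * y over k digits, where c0 is the running carry at the
// k'th position and c1 absorbs the one extra carry mentioned above.
static void row_madd_model(uint64_t k, uint64_t *z, uint64_t *c0, uint64_t *c1,
                           uint64_t d, const uint64_t *y) {
  uint64_t h = 0;
  for (uint64_t j = 0; j < k; j++) {
    unsigned __int128 p = (unsigned __int128)d * y[j] + z[j] + h;
    z[j] = (uint64_t)p;
    h = (uint64_t)(p >> 64);
  }
  unsigned __int128 top = (unsigned __int128)*c0 + h;
  *c0 = (uint64_t)top;
  *c1 = (uint64_t)(top >> 64);
}

One such row per digit of x, each followed by the Montgomery reduction pass sketched next to the bignum_montsqr copy of this structure further down, reproduces the outer loop.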
+ + ldr d, [x, i, lsl #3] + mov j, xzr + adds h, xzr, xzr +bignum_montmul_maddloop: + ldr a, [y, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + str e, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montmul_maddloop + adcs c0, c0, h + adc c1, xzr, xzr + +// Montgomery reduction loop, similar but offsetting writebacks + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub t, k, #1 + cbz t, bignum_montmul_montend +bignum_montmul_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montmul_montloop +bignum_montmul_montend: + adcs h, c0, h + adc c0, c1, xzr + sub l, j, #1 + str h, [z, l, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, k + bcc bignum_montmul_outerloop + +// Now do a comparison of (c0::z) with (0::m) to set a final correction mask +// indicating that (c0::z) >= m and so we need to subtract m. + + subs j, xzr, xzr +bignum_montmul_cmploop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + sbcs xzr, a, e + add j, j, #1 + sub t, j, k + cbnz t, bignum_montmul_cmploop + + sbcs xzr, c0, xzr + csetm c0, cs + +// Now do a masked subtraction of m for the final reduced result. + + subs j, xzr, xzr +bignum_montmul_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c0 + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montmul_corrloop + +bignum_montmul_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montredc.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montredc.S new file mode 100644 index 00000000000..5b19e7a6acd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montredc.S @@ -0,0 +1,194 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery reduce, z := (x' / 2^{64p}) MOD m +// Inputs x[n], m[k], p; output z[k] +// +// extern void bignum_montredc +// (uint64_t k, uint64_t *z, +// uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); +// +// Does a := (x' / 2^{64p}) mod m where x' = x if n <= p + k and in general +// is the lowest (p+k) digits of x, assuming x' <= 2^{64p} * m. That is, +// p-fold Montgomery reduction w.r.t. a k-digit modulus m giving a k-digit +// answer. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = m, X5 = p +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montredc) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 +#define x x3 +#define m x4 +#define p x5 + +// Negated modular inverse +#define w x6 +// Outer loop counter +#define i x7 +// Inner loop counter +#define j x8 +// Home for Montgomery multiplier +#define d x9 +// Top carry for current window +#define c x14 + +#define h x10 +#define e x11 +#define l x12 +#define a x13 + +// Some more intuitive names for temp regs in initial word-level negmodinv. 
+// These just use i and j again, which aren't used early on. + +#define one x7 +#define e1 x7 +#define e2 x8 +#define e4 x7 +#define e8 x8 + + +S2N_BN_SYMBOL(bignum_montredc): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_montredc_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize z to the lowest k digits of the input, zero-padding if n < k. + + cmp n, k + csel j, k, n, cs + mov i, xzr + cbz j, bignum_montredc_padloop +bignum_montredc_copyloop: + ldr a, [x, i, lsl #3] + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, j + bcc bignum_montredc_copyloop + + cmp i, k + bcs bignum_montredc_initialized + +bignum_montredc_padloop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_montredc_padloop + +bignum_montredc_initialized: + mov c, xzr + +// Now if p = 0 we just need the corrective tail, and even that is +// only needed for the case when the input is exactly the modulus, +// to maintain the <= 2^64p * n precondition + + cbz p, bignum_montredc_corrective + +// Outer loop, just doing a standard Montgomery reduction on z + + mov i, xzr +bignum_montredc_outerloop: + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub a, k, #1 + cbz a, bignum_montredc_montend +bignum_montredc_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montredc_montloop +bignum_montredc_montend: + adcs h, h, c + adc c, xzr, xzr + add j, j, i + cmp j, n + bcs bignum_montredc_offtheend + ldr a, [x, j, lsl #3] + adds h, h, a + adc c, c, xzr +bignum_montredc_offtheend: + sub j, k, #1 + str h, [z, j, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, p + bcc bignum_montredc_outerloop + +// Now do a comparison of (c::z) with (0::m) to set a final correction mask +// indicating that (c::z) >= m and so we need to subtract m. + +bignum_montredc_corrective: + + subs j, xzr, xzr +bignum_montredc_cmploop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + sbcs xzr, a, e + add j, j, #1 + sub a, j, k + cbnz a, bignum_montredc_cmploop + + sbcs xzr, c, xzr + csetm c, cs + +// Now do a masked subtraction of m for the final reduced result. + + subs j, xzr, xzr +bignum_montredc_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub a, j, k + cbnz a, bignum_montredc_corrloop + +bignum_montredc_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montsqr.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montsqr.S new file mode 100644 index 00000000000..a7824964a83 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_montsqr.S @@ -0,0 +1,192 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_montsqr +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z := (x^2 / 2^{64k}) mod m, assuming x^2 <= 2^{64k} * m, which is +// guaranteed in particular if x < m initially (the "intended" case). +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = m +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define m x3 + +// Negated modular inverse +#define w x4 +// Top carry for k'th position +#define c0 x5 +// Additional top carry for (k+1)'th position +#define c1 x6 +// Outer loop counter +#define i x7 +// Home for i'th digit or Montgomery multiplier +#define d x8 +// Inner loop counter +#define j x9 + +#define h x10 +#define e x11 +#define l x12 +#define a x13 + +// This is just a short-term temporary used in zero-test subtraction. +// It's aliased to the same register as "a" which is always safe here. + +#define t x13 + +// Some more intuitive names for temp regs in initial word-level negmodinv. +// These just use c0 and c1 again, which aren't initialized early on. + +#define one x5 +#define e1 x5 +#define e2 x6 +#define e4 x5 +#define e8 x6 + + +S2N_BN_SYMBOL(bignum_montsqr): + +// If k = 0 the whole operation is trivial + + cbz k, bignum_montsqr_end + +// Compute word-level negated modular inverse w for m == m[0]. +// This is essentially the same as word_negmodinv. + + ldr a, [m] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov one, #1 + madd e1, a, w, one + mul e2, e1, e1 + madd w, e1, w, w + mul e4, e2, e2 + madd w, e2, w, w + mul e8, e4, e4 + madd w, e4, w, w + madd w, e8, w, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + mov i, xzr +bignum_montsqr_zoop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_montsqr_zoop + mov c0, xzr + +// Outer loop pulling down digits d=x[i], multiplying by x and reducing + + mov i, xzr +bignum_montsqr_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. 
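The code that follows is one row of the classic word-serial Montgomery pattern (bignum_montmul above has the same shape, with a separate y operand): add x[i] times the multiplicand into the accumulator, then perform one reduction step with d = z[0] * w, chosen so the low word cancels, and let the offset writebacks shift everything down a word. Purely for orientation, a C sketch of the whole routine is given below; it is not part of the patch, the function name is invented, it assumes a compiler providing unsigned __int128 (as GCC and Clang do), and it assumes z does not alias the inputs. For the squaring case, read it with y == x.

#include <stdint.h>

/* z := (x*y / 2^{64k}) mod m, where w = -1/m[0] mod 2^64 and
   x*y <= 2^{64k} * m (which holds in particular when x, y < m). */
static void montmul_sketch(uint64_t k, uint64_t *z, const uint64_t *x,
                           const uint64_t *y, const uint64_t *m, uint64_t w) {
  uint64_t c0 = 0, c1 = 0;                      /* carries above z[k-1] */
  for (uint64_t i = 0; i < k; i++) z[i] = 0;
  for (uint64_t i = 0; i < k; i++) {
    /* Multiply-add row: (c1:c0:z) += x[i] * y */
    unsigned __int128 t = 0;
    for (uint64_t j = 0; j < k; j++) {
      t += (unsigned __int128)x[i] * y[j] + z[j];
      z[j] = (uint64_t)t;
      t >>= 64;
    }
    t += c0;
    c0 = (uint64_t)t;
    c1 = (uint64_t)(t >> 64);
    /* Montgomery step: d makes z[0] + d*m[0] divisible by 2^64, and the
       whole accumulator is then divided by 2^64 via the offset writebacks. */
    uint64_t d = z[0] * w;
    t = ((unsigned __int128)d * m[0] + z[0]) >> 64;
    for (uint64_t j = 1; j < k; j++) {
      t += (unsigned __int128)d * m[j] + z[j];
      z[j - 1] = (uint64_t)t;
      t >>= 64;
    }
    t += c0;
    z[k - 1] = (uint64_t)t;
    c0 = c1 + (uint64_t)(t >> 64);
  }
  /* Final correction: subtract m once if (c0::z) >= m.  A branch is used
     here for clarity; the assembly keeps this part branch-free. */
  uint64_t borrow = 0;
  for (uint64_t j = 0; j < k; j++) {
    unsigned __int128 s = (unsigned __int128)z[j] - m[j] - borrow;
    borrow = (uint64_t)(s >> 64) & 1;
  }
  uint64_t mask = (c0 != 0 || borrow == 0) ? ~(uint64_t)0 : (uint64_t)0;
  borrow = 0;
  for (uint64_t j = 0; j < k; j++) {
    unsigned __int128 s = (unsigned __int128)z[j] - (m[j] & mask) - borrow;
    z[j] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;
  }
}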
+ + ldr d, [x, i, lsl #3] + mov j, xzr + adds h, xzr, xzr +bignum_montsqr_maddloop: + ldr a, [x, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + str e, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montsqr_maddloop + adcs c0, c0, h + adc c1, xzr, xzr + +// Montgomery reduction loop, similar but offsetting writebacks + + ldr e, [z] + mul d, e, w + ldr a, [m] + mul l, d, a + umulh h, d, a + adds e, e, l // Will be zero but want the carry + mov j, #1 + sub t, k, #1 + cbz t, bignum_montsqr_montend +bignum_montsqr_montloop: + ldr a, [m, j, lsl #3] + ldr e, [z, j, lsl #3] + mul l, d, a + adcs e, e, h + umulh h, d, a + adc h, h, xzr + adds e, e, l + sub l, j, #1 + str e, [z, l, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montsqr_montloop +bignum_montsqr_montend: + adcs h, c0, h + adc c0, c1, xzr + sub l, j, #1 + str h, [z, l, lsl #3] + +// End of outer loop + + add i, i, #1 + cmp i, k + bcc bignum_montsqr_outerloop + +// Now do a comparison of (c0::z) with (0::m) to set a final correction mask +// indicating that (c0::z) >= m and so we need to subtract m. + + subs j, xzr, xzr +bignum_montsqr_cmploop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + sbcs xzr, a, e + add j, j, #1 + sub t, j, k + cbnz t, bignum_montsqr_cmploop + + sbcs xzr, c0, xzr + csetm c0, cs + +// Now do a masked subtraction of m for the final reduced result. + + subs j, xzr, xzr +bignum_montsqr_corrloop: + ldr a, [z, j, lsl #3] + ldr e, [m, j, lsl #3] + and e, e, c0 + sbcs a, a, e + str a, [z, j, lsl #3] + add j, j, #1 + sub t, j, k + cbnz t, bignum_montsqr_corrloop + +bignum_montsqr_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/generic/bignum_mul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mul.S similarity index 100% rename from third_party/s2n-bignum/arm/generic/bignum_mul.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mul.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_muladd10.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_muladd10.S new file mode 100644 index 00000000000..a1deef001a5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_muladd10.S @@ -0,0 +1,62 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply bignum by 10 and add word: z := 10 * z + d +// Inputs z[k], d; outputs function return (carry) and z[k] +// +// extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d); +// +// Although typically the input d < 10, this is not actually required. 
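Functionally, each step replaces z[i] by the low word of 10 * z[i] plus the incoming carry and passes the high word on, with d acting as the initial carry. A C sketch of that contract is shown below; it is not part of the patch, the helper name is invented, and it assumes a compiler providing unsigned __int128 (GCC/Clang). The assembly below avoids the wide multiply by forming the two-word product 10*a as 2*a + 8*a using shifts.

#include <stdint.h>

/* z := 10*z + d over k little-endian 64-bit digits; returns the carry out. */
static uint64_t bignum_muladd10_sketch(uint64_t k, uint64_t *z, uint64_t d) {
  for (uint64_t i = 0; i < k; i++) {
    unsigned __int128 t = (unsigned __int128)z[i] * 10 + d;
    z[i] = (uint64_t)t;          /* low word stays in place */
    d = (uint64_t)(t >> 64);     /* high word is the carry into the next digit */
  }
  return d;                      /* for k = 0 this is just the input d */
}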
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = d, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_muladd10) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_muladd10) + .text + .balign 4 + +#define k x0 +#define z x1 +#define d x2 + +#define i x3 +#define h x4 + +#define a x5 +#define l x5 + +S2N_BN_SYMBOL(bignum_muladd10): + +// If k = 0 just return the input d as the carry (out of zero digits) + + cbz k, bignum_muladd10_end + +// Simple loop + + mov i, xzr +bignum_muladd10_loop: + ldr a, [z, i, lsl #3] + lsr h, a, #61 + add l, a, a + add h, h, h, lsr #2 + adds l, l, l, lsl #2 + adc h, h, xzr + adds a, l, d + str a, [z, i, lsl #3] + adc d, h, xzr + add i, i, 1 + cmp i, k + bcc bignum_muladd10_loop + +// Return the final carry + +bignum_muladd10_end: + mov x0, d + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux.S new file mode 100644 index 00000000000..1b45f9fa200 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux.S @@ -0,0 +1,50 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[k], y[k]; output z[k] +// +// extern void bignum_mux +// (uint64_t p, uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y); +// +// It is assumed that all numbers x, y and z have the same size k digits. +// +// Standard ARM ABI: X0 = p, X1 = k, X2 = z, X3 = x, X4 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux) + .text + .balign 4 + +#define b x0 +#define k x1 +#define z x2 +#define x x3 +#define y x4 +#define a x5 + + +S2N_BN_SYMBOL(bignum_mux): + +cbz k, bignum_mux_end // if k = 0 skip the bignum_mux_loop + cmp b, #0 // Set condition codes b = 0 + +// We've set cc's from b once and for all and can now re-use "b" as a temporary + +bignum_mux_loop: + sub k, k, #1 + ldr a, [x, k, lsl #3] + ldr b, [y, k, lsl #3] + csel a, a, b, ne + str a, [z, k, lsl #3] + cbnz k, bignum_mux_loop + +bignum_mux_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux16.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux16.S new file mode 100644 index 00000000000..c3999111346 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_mux16.S @@ -0,0 +1,67 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select element from 16-element table, z := xs[k*i] +// Inputs xs[16*k], i; output z[k] +// +// extern void bignum_mux16 +// (uint64_t k, uint64_t *z, uint64_t *xs, uint64_t i); +// +// It is assumed that all numbers xs[16] and the target z have the same size k +// The pointer xs is to a contiguous array of size 16, elements size-k bignums +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = xs, X3 = i +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux16) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define i x3 + +#define a x4 +#define b x5 +#define j x6 +#define n x7 + + +S2N_BN_SYMBOL(bignum_mux16): + +// Copy size into decrementable counter, skip everything if k = 0 + + adds n, k, xzr + beq bignum_mux16_end + +// Multiply i by k so we can compare pointer offsets directly with it + + mul i, i, k + +bignum_mux16_loop: + + ldr a, [x] + mov j, k + .rep 15 + ldr b, [x, j, lsl #3] + cmp j, i + csel a, b, a, eq + add j, j, k + .endr + str a, [z] + + add z, z, #8 + add x, x, #8 + subs n, n, #1 + bne bignum_mux16_loop + +bignum_mux16_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_negmodinv.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_negmodinv.S new file mode 100644 index 00000000000..4772a3512db --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_negmodinv.S @@ -0,0 +1,135 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negated modular inverse, z := (-1/x) mod 2^{64k} +// Input x[k]; output z[k] +// +// extern void bignum_negmodinv +// (uint64_t k, uint64_t *z, uint64_t *x); +// +// Assuming x is odd (otherwise nothing makes sense) the result satisfies +// +// x * z + 1 == 0 (mod 2^{64 * k}) +// +// but is not necessarily reduced mod x. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_negmodinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_negmodinv) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 + +#define w x3 +#define a x4 +#define m x5 +#define h x6 +#define l x7 +#define e x8 +#define i x9 + + + +S2N_BN_SYMBOL(bignum_negmodinv): + +// If k = 0 do nothing + + cbz k, bignum_negmodinv_end + +// Compute word-level negated modular inverse w for x[0]. + + ldr a, [x] + lsl w, a, #2 + sub w, a, w + eor w, w, #2 + mov h, #1 + madd h, a, w, h + mul l, h, h + madd w, h, w, w + mul h, l, l + madd w, l, w, w + mul l, h, h + madd w, h, w, w + madd w, l, w, w + +// Write that as lowest word of the output, then if k = 1 we're finished + + str w, [z] + cmp k, #1 + beq bignum_negmodinv_end + +// Otherwise compute and write the other digits (1..k-1) of w * x + 1. +// Note that at this point CF was set by the comparison (subtraction) "k - 1". +// Since k >= 2 if we got here, this subtraction didn't carry; allowing +// for the inverted carry on ARM that means that CF is guaranteed to be set. 
+// This allows us to ignore the nominal "a * w + 1" from adding the low +// part of the product, since its only contribution is to set the carry +// flag. Thus, we only calculate the high part of a * w explicitly. + + umulh h, a, w + mov i, #1 +bignum_negmodinv_initloop: + ldr a, [x, i, lsl #3] + mul l, a, w + adcs l, l, h + umulh h, a, w + str l, [z, i, lsl #3] + add i, i, #1 + sub a, k, i + cbnz a, bignum_negmodinv_initloop + +// For simpler indexing, z := z + 8 and k := k - 1 per outer iteration +// Then we can use the same index for x and for z and effective size k. +// +// But we also offset k by 1 so the "real" size is k + 1, which is why the +// test at the end of the inner loop is i < k <=> i' = i + 1 < k + 1. +// This lets us avoid some special cases inside the loop at the cost +// of needing the additional "finale" tail for the final iteration +// since we do one outer loop iteration too few. + + subs k, k, #2 + beq bignum_negmodinv_finale + +bignum_negmodinv_outerloop: + add z, z, #8 + ldr e, [z] + mul m, e, w + str m, [z] + ldr a, [x] + umulh h, a, m + subs xzr, e, #1 // Effective carry from a * m + e + mov i, #1 +bignum_negmodinv_innerloop: + ldr a, [x, i, lsl #3] + ldr e, [z, i, lsl #3] + mul l, a, m + adcs e, e, h + umulh h, a, m + adc h, h, xzr + adds e, e, l + str e, [z, i, lsl #3] + sub a, i, k + add i, i, #1 + cbnz a, bignum_negmodinv_innerloop + + subs k, k, #1 + bne bignum_negmodinv_outerloop + +bignum_negmodinv_finale: + ldr e, [z, #8] + mul m, e, w + str m, [z, #8] + +bignum_negmodinv_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_nonzero.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_nonzero.S new file mode 100644 index 00000000000..072018af3ef --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_nonzero.S @@ -0,0 +1,44 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for nonzero-ness x =/= 0 +// Input x[k]; output function return +// +// extern uint64_t bignum_nonzero (uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_nonzero) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_nonzero) + .text + .balign 4 + +#define k x0 +#define x x1 +#define a x2 +#define c x3 + + +S2N_BN_SYMBOL(bignum_nonzero): + +mov c, xzr // c will be or of the digits + cbz k, bignum_nonzero_end // if k = 0 skip the bignum_nonzero_loop + +bignum_nonzero_loop: + sub k, k, #1 + ldr a, [x, k, lsl #3] + orr c, c, a + cbnz k, bignum_nonzero_loop + + cmp c, xzr + cset x0, ne + +bignum_nonzero_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_normalize.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_normalize.S new file mode 100644 index 00000000000..403bb1935fb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_normalize.S @@ -0,0 +1,108 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Normalize bignum in-place by shifting left till top bit is 1 +// Input z[k]; outputs function return (bits shifted left) and z[k] +// +// extern uint64_t bignum_normalize (uint64_t k, uint64_t *z); +// +// Given a k-digit bignum z, this function shifts it left by its number of +// leading zero bits, to give result with top bit 1, unless the input number +// was 0. The return is the same as the output of bignum_clz, i.e. the number +// of bits shifted (nominally 64 * k in the case of zero input). +// +// Standard ARM ABI: X0 = k, X1 = z, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_normalize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_normalize) + .text + .balign 4 + +#define k x0 +#define z x1 + +// This is the return value we accumulate + +#define r x2 + +// Other variables + +#define a x3 +#define b x4 +#define c x5 +#define d x6 +#define i x7 +#define j x8 +#define l x9 + + +S2N_BN_SYMBOL(bignum_normalize): + +// If k = 0 the whole operation is trivial. Otherwise initialize +// shift count r and top digit c, but then if k = 1 skip the digitwise part + + subs i, k, #1 + bcc bignum_normalize_end + ldr c, [z, i, lsl #3] + mov r, xzr + beq bignum_normalize_bitpart + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. + +bignum_normalize_normloop: + mov j, xzr + cmp c, xzr + cinc r, r, eq + mov a, xzr +bignum_normalize_shufloop: + mov c, a + ldr a, [z, j, lsl #3] + csel c, c, a, eq + str c, [z, j, lsl #3] + add j, j, #1 + sub d, j, k + cbnz d, bignum_normalize_shufloop + subs i, i, #1 + bne bignum_normalize_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift z bitwise that many bits. + +bignum_normalize_bitpart: + lsl r, r, #6 + clz c, c + add r, r, c + + mov b, xzr + mov i, xzr + ands xzr, c, #63 + csetm l, ne + neg d, c +bignum_normalize_bitloop: + ldr j, [z, i, lsl #3] + lsl a, j, c + orr a, a, b + lsr b, j, d + and b, b, l + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_normalize_bitloop + +// Return the final shift count + + mov x0, r + +bignum_normalize_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_odd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_odd.S new file mode 100644 index 00000000000..54d24fd6a74 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_odd.S @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for odd-ness +// Input x[k]; output function return +// +// extern uint64_t bignum_odd (uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = k, X1 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_odd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_odd) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_odd): + +cbz x0, bignum_odd_end // if k = 0, that's the return! + ldr x0, [x1] + and x0, x0, #1 + +bignum_odd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_of_word.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_of_word.S new file mode 100644 index 00000000000..b355ce79eb7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_of_word.S @@ -0,0 +1,45 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert single digit to bignum, z := n +// Input n; output z[k] +// +// extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n); +// +// Create a k-digit (digit=64 bits) bignum at z with value n (mod 2^k) +// where n is a word. The "mod 2^k" only matters in the degenerate k = 0 case. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_of_word) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_of_word) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 + + +S2N_BN_SYMBOL(bignum_of_word): + +cbz k, bignum_of_word_end // if k = 0 do nothing + + str n, [z] // Set zeroth word to n + subs k, k, #1 // k := k - 1 + beq bignum_of_word_end // and if that's 0, finish + +bignum_of_word_loop: + str xzr, [z, k, lsl #3] + subs k, k, #1 + bne bignum_of_word_loop + +bignum_of_word_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optadd.S new file mode 100644 index 00000000000..f4821128f87 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optadd.S @@ -0,0 +1,71 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally add, z := x + y (if p nonzero) or z := x (if p zero) +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_optadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); +// +// It is assumed that all numbers x, y and z have the same size k digits. +// Returns carry-out as per usual addition, always 0 if p was zero. 
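The constant-time trick here is the usual masking one: p is converted into an all-zeros or all-ones word, which is ANDed onto every word of y, so exactly the same add-with-carry sequence runs whether or not the addition is wanted. A C sketch of the same contract (not part of the patch; the helper name is invented) is:

#include <stdint.h>

/* z := x + (p ? y : 0) over k digits; returns the carry out (0 or 1). */
static uint64_t bignum_optadd_sketch(uint64_t k, uint64_t *z, const uint64_t *x,
                                     uint64_t p, const uint64_t *y) {
  uint64_t mask = (uint64_t)0 - (uint64_t)(p != 0);   /* csetm-style mask */
  uint64_t carry = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t b = y[i] & mask;           /* y[i] or 0, with no branch */
    uint64_t s = x[i] + b;
    uint64_t c = (uint64_t)(s < x[i]);  /* carry from the first addition */
    z[i] = s + carry;
    carry = c + (uint64_t)(z[i] < s);   /* total carry is still 0 or 1 */
  }
  return carry;
}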
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = p, X4 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optadd) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define p x3 +#define y x4 + +#define m x3 +#define a x5 +#define b x6 +#define i x7 + + +S2N_BN_SYMBOL(bignum_optadd): + +// if k = 0 do nothing. This is also the right top carry in X0 + + cbz k, bignum_optadd_end + +// Convert p into a strict bitmask (same register in fact) + + cmp p, xzr + csetm m, ne + +// Set i = 0 *and* make sure initial CF = 0 + + adds i, xzr, xzr + +// Main loop + +bignum_optadd_loop: + ldr a, [x, i] + ldr b, [y, i] + and b, b, m + adcs a, a, b + str a, [z, i] + add i, i, #8 + sub k, k, #1 + cbnz k, bignum_optadd_loop + +// Return carry flag + + adc x0, xzr, xzr + +bignum_optadd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optneg.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optneg.S new file mode 100644 index 00000000000..e4507f67d6c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optneg.S @@ -0,0 +1,70 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate, z := -x (if p nonzero) or z := x (if p zero) +// Inputs p, x[k]; outputs function return (nonzero input) and z[k] +// +// extern uint64_t bignum_optneg +// (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x); +// +// It is assumed that both numbers x and z have the same size k digits. +// Returns a carry, which is equivalent to "x is nonzero". +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = p, X3 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg) + .text + .balign 4 + +#define k x0 +#define z x1 +#define p x2 +#define x x3 + +#define a x4 +#define i x5 + + +S2N_BN_SYMBOL(bignum_optneg): + +// if k = 0 do nothing. 
This also has the right top carry zero in x0 + + cbz k, bignum_optneg_end + +// Convert p into a strict bitmask + + cmp p, xzr + csetm p, ne + +// Generate an initial carry-in for the negating case only to add 1; this +// is because we are actually going to do complements of the words of x + + adds xzr, p, p + +// Main loop + mov i, xzr +bignum_optneg_loop: + ldr a, [x, i] + eor a, a, p + adcs a, a, xzr + str a, [z, i] + add i, i, #8 + sub k, k, #1 + cbnz k, bignum_optneg_loop + +// Return carry flag, fixing up inversion for negative case + + adc x0, xzr, xzr + neg p, p + eor x0, x0, p + +bignum_optneg_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/generic/bignum_optsub.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optsub.S similarity index 100% rename from third_party/s2n-bignum/arm/generic/bignum_optsub.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optsub.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optsubadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optsubadd.S new file mode 100644 index 00000000000..0dcc10a39a5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_optsubadd.S @@ -0,0 +1,86 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_optsubadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); +// +// If p has top bit set (i.e. is negative as a signed int) return z := x - y +// Else if p is nonzero (i.e. is positive as a signed int) return z := x + y +// Otherwise (i.e. p is zero) return z := x +// +// Return in X0 = the top carry, which will be 0 or 1, and appropriate for +// addition or subtraction respectively (and always zero for p = 0) +// +// 2^{64*k} * -carryout + z = x - y [for subtraction] +// 2^{64*k} * carryout + z = x + y [for addition] +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = x, X3 = p, X4 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optsubadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optsubadd) + .text + .balign 4 + +#define k x0 +#define z x1 +#define x x2 +#define p x3 +#define y x4 + +#define m x3 +#define q x5 +#define a x6 +#define b x7 +#define i x8 + + +S2N_BN_SYMBOL(bignum_optsubadd): + +// if k = 0 do nothing. 
This is also the right top carry in X0 + + cbz k, bignum_optsubadd_end + +// Turn the input p into two bitmasks, m indicating to use the y input at +// all (same register as p) and q indicating a sign-flip + + cmp p, xzr + csetm m, ne + csetm q, mi + +// Generate an initial carry-in for the negating case only to add 1; this +// is because we are actually going to do complements of the words of y + + adds xzr, q, q + +// Main loop + + mov i, xzr +bignum_optsubadd_loop: + ldr b, [y, i] + eor b, b, q + ldr a, [x, i] + and b, b, m + adcs a, a, b + str a, [z, i] + add i, i, #8 + sub k, k, #1 + cbnz k, bignum_optsubadd_loop + +// Return carry flag, fixing up inversion for negative case + + adc x0, xzr, xzr + neg q, q + eor x0, x0, q + +bignum_optsubadd_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_pow2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_pow2.S new file mode 100644 index 00000000000..4f647a55ebd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_pow2.S @@ -0,0 +1,60 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return bignum of power of 2, z := 2^n +// Input n; output z[k] +// +// extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n); +// +// The result is as usual mod 2^{64*k}, so will be zero if n >= 64*k. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_pow2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_pow2) + .text + .balign 4 + +#define k x0 +#define z x1 +#define n x2 + +#define w x3 +#define i x4 +#define a x5 + + +S2N_BN_SYMBOL(bignum_pow2): + +// If k = 0 the result is trivially zero + + cbz k, bignum_pow2_end + +// Create the index n at which to write the nonzero word and the word w itself +// Note that the ARM manual explicitly says that shift counts are taken modulo +// the datasize, so we don't need to mask the lower 6 bits of n ourselves. + + mov w, #1 + lsl w, w, n + lsr n, n, #6 + +// Now in a constant-time fashion set the n'th word to w and others to zero + + mov i, xzr +bignum_pow2_loop: + cmp i, n + csel a, w, xzr, eq + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, k + bcc bignum_pow2_loop + +bignum_pow2_end: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shl_small.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shl_small.S new file mode 100644 index 00000000000..77cf097b006 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shl_small.S @@ -0,0 +1,99 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Shift bignum left by c < 64 bits z := x * 2^c +// Inputs x[n], c; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_shl_small +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); +// +// Does the "z := x << c" operation where x is n digits, result z is p. 
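A plain-C picture of that contract may help (a sketch only, not part of the patch; the helper name is invented): each output word is the shifted input word OR'd with the bits carried out of the previous one, output words beyond the input receive the carry word and then zeros, and whatever carry is left over is returned as the next word of the result.

#include <stdint.h>

/* z := x << (c mod 64), truncated to p digits; returns the next word up. */
static uint64_t bignum_shl_small_sketch(uint64_t p, uint64_t *z, uint64_t n,
                                        const uint64_t *x, uint64_t c) {
  c &= 63;
  if (n > p) n = p;                     /* never read past the p'th input word */
  uint64_t carry = 0;
  for (uint64_t i = 0; i < n; i++) {
    uint64_t t = x[i];
    z[i] = (t << c) | carry;
    carry = c ? (t >> (64 - c)) : 0;    /* avoid the undefined 64-bit shift */
  }
  for (uint64_t i = n; i < p; i++) {    /* carry word, then zero padding */
    z[i] = carry;
    carry = 0;
  }
  return carry;
}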
+// The shift count c is masked to 6 bits so it actually uses c' = c mod 64. +// The return value is the "next word" of a p+1 bit result, if n <= p. +// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = c, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_shl_small) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_shl_small) + .text + .balign 4 + +#define p x0 +#define z x1 +#define n x2 +#define x x3 +#define c x4 + +#define d x5 +#define a x6 +#define b x7 +#define m x8 +#define t x9 +#define i x10 + + +S2N_BN_SYMBOL(bignum_shl_small): + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. + + cmp n, p + csel n, p, n, cs + +// Initialize counter i and "previous word" carry b to zero +// and skip main loop if n = 0 + + mov b, xzr + mov i, xzr + cbz n, bignum_shl_small_tail + +// Set up a mask for nonzero shift and a negated version of the shift. +// Note that all basic word-level shifts are predictably masked to 6 bits. + + ands xzr, c, #63 + csetm m, ne + neg d, c + +// Now the main loop +bignum_shl_small_loop: + ldr t, [x, i, lsl #3] + lsl a, t, c + orr a, a, b + lsr b, t, d + and b, b, m + str a, [z, i, lsl #3] + add i, i, #1 + cmp i, n + bcc bignum_shl_small_loop + +// If we are at the end, finish, otherwise write carry word then zeros + +bignum_shl_small_tail: + + cmp i, p + bcs bignum_shl_small_end + str b, [z, i, lsl #3] + mov b, xzr + add i, i, #1 + cmp i, p + bcs bignum_shl_small_end + +bignum_shl_small_tloop: + str xzr, [z, i, lsl #3] + add i, i, #1 + cmp i, p + bcc bignum_shl_small_tloop + +// Return top word + +bignum_shl_small_end: + + mov x0, b + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shr_small.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shr_small.S new file mode 100644 index 00000000000..8ddcad9a83b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_shr_small.S @@ -0,0 +1,90 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Shift bignum right by c < 64 bits z := floor(x / 2^c) +// Inputs x[n], c; outputs function return (bits shifted out) and z[k] +// +// extern uint64_t bignum_shr_small +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); +// +// Does the "z := x >> c" operation where x is n digits, result z is p. +// The shift count c is masked to 6 bits so it actually uses c' = c mod 64. +// The return value is the inout mod 2^c'. 
+// +// Standard ARM ABI: X0 = k, X1 = z, X2 = n, X3 = x, X4 = c, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_shr_small) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_shr_small) + .text + .balign 4 + +#define p x0 +#define z x1 +#define n x2 +#define x x3 +#define c x4 + +#define d x5 +#define a x6 +#define b x7 +#define m x8 +#define t x9 + + +S2N_BN_SYMBOL(bignum_shr_small): + +// Set default carry-in word to 0 + + mov b, xzr + +// First, if p > n then pad output on the left with p-n zeros + + cmp n, p + bcs bignum_shr_small_nopad +bignum_shr_small_padloop: + sub p, p, #1 + str xzr, [z, p, lsl #3] + cmp n, p + bcc bignum_shr_small_padloop + +// We now know that p <= n. If in fact p < n let carry word = x[p] instead of 0 + +bignum_shr_small_nopad: + beq bignum_shr_small_shiftstart + ldr b, [x, p, lsl #3] +bignum_shr_small_shiftstart: + +// Set up negated version of the shift and shift b in preparation. +// Use a mask for nonzero shift to fake 64-bit left shift in zero case + + neg d, c + lsl b, b, d + ands xzr, c, #63 + csetm m, ne + and b, b, m + +// Now the main loop + + cbz p, bignum_shr_small_end +bignum_shr_small_loop: + sub p, p, #1 + ldr t, [x, p, lsl #3] + lsr a, t, c + orr a, a, b + lsl b, t, d + and b, b, m + str a, [z, p, lsl #3] + cbnz p, bignum_shr_small_loop + +// Return top word, shifted back to be a modulus + +bignum_shr_small_end: + lsr x0, b, d + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/arm/generic/bignum_sqr.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_sqr.S similarity index 100% rename from third_party/s2n-bignum/arm/generic/bignum_sqr.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_sqr.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_sub.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_sub.S new file mode 100644 index 00000000000..5e9e40c9550 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/bignum_sub.S @@ -0,0 +1,118 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract, z := x - y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +// +// extern uint64_t bignum_sub +// (uint64_t p, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the z := x - y operation, truncating modulo p words in general and +// returning a top borrow (0 or 1) in the p'th place, only subtracting input +// words below p (as well as m and n respectively) to get the diff and borrow. +// +// Standard ARM ABI: X0 = p, X1 = z, X2 = m, X3 = x, X4 = n, X5 = y, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub) + .text + .balign 4 + +#define p x0 +#define z x1 +#define m x2 +#define x x3 +#define n x4 +#define y x5 +#define i x6 +#define a x7 +#define d x8 + + +S2N_BN_SYMBOL(bignum_sub): + +// First clamp the two input sizes m := min(p,m) and n := min(p,n) since +// we'll never need words past the p'th. Can now assume m <= p and n <= p. 
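Taken together, the size clamping just described and the branch-and-loop code that follows implement a simple contract, sketched in C here for orientation (not part of the patch; the helper name is invented and unsigned __int128 is assumed): words beyond each operand's length behave as zeros, nothing past the p'th word of either input is read, and the return value is the borrow out of the p'th place.

#include <stdint.h>

/* z := x - y truncated to p digits; returns the borrow (0 or 1) at place p. */
static uint64_t bignum_sub_sketch(uint64_t p, uint64_t *z,
                                  uint64_t m, const uint64_t *x,
                                  uint64_t n, const uint64_t *y) {
  if (m > p) m = p;                     /* clamp both input sizes to p */
  if (n > p) n = p;
  uint64_t borrow = 0;
  for (uint64_t i = 0; i < p; i++) {
    uint64_t a = (i < m) ? x[i] : 0;    /* shorter operands act as zero */
    uint64_t b = (i < n) ? y[i] : 0;
    unsigned __int128 s = (unsigned __int128)a - b - borrow;
    z[i] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;   /* 1 exactly when a < b + borrow */
  }
  return borrow;
}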
+// Then compare the modified m and n and branch accordingly + + cmp m, p + csel m, p, m, cs + cmp n, p + csel n, p, n, cs + cmp m, n + bcc bignum_sub_ylonger + +// The case where x is longer or of the same size (p >= m >= n) + + sub p, p, m + sub m, m, n + subs i, xzr, xzr + cbz n, bignum_sub_xmainskip +bignum_sub_xmainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + sbcs a, a, d + str a, [z, i, lsl #3] + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_sub_xmainloop +bignum_sub_xmainskip: + cbz m, bignum_sub_xtopskip +bignum_sub_xtoploop: + ldr a, [x, i, lsl #3] + sbcs a, a, xzr + str a, [z, i, lsl #3] + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_sub_xtoploop +bignum_sub_xtopskip: + cbnz p, bignum_sub_tails + cset x0, cc + ret + +// The case where y is longer (p >= n > m) + +bignum_sub_ylonger: + sub p, p, n + sub n, n, m + subs i, xzr, xzr + cbz m, bignum_sub_ytoploop +bignum_sub_ymainloop: + ldr a, [x, i, lsl #3] + ldr d, [y, i, lsl #3] + sbcs a, a, d + str a, [z, i, lsl #3] + add i, i, #1 + sub m, m, #1 + cbnz m, bignum_sub_ymainloop +bignum_sub_ytoploop: + ldr a, [y, i, lsl #3] + sbcs a, xzr, a + str a, [z, i, lsl #3] + add i, i, #1 + sub n, n, #1 + cbnz n, bignum_sub_ytoploop +bignum_sub_ytopskip: + cbnz p, bignum_sub_tails + cset x0, cc + ret + +// Adding a non-trivial tail, when p > max(m,n) + +bignum_sub_tails: + csetm a, cc +bignum_sub_tailloop: + str a, [z, i, lsl #3] + add i, i, #1 + subs p, p, #1 + bne bignum_sub_tailloop + neg x0, a + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_bytereverse.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_bytereverse.S new file mode 100644 index 00000000000..bb892b7b0bf --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_bytereverse.S @@ -0,0 +1,39 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reverse the order of bytes in a 64-bit word +// +// extern uint64_t word_bytereverse (uint64_t a); +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_bytereverse) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_bytereverse) + .text + .balign 4 + +S2N_BN_SYMBOL(word_bytereverse): + + mov x1, #0xFFFF0000FFFF0000 + mov x2, #0x0000FFFF0000FFFF + and x1, x1, x0 + and x2, x2, x0 + ror x1, x1, #32 + orr x0, x1, x2 + + mov x1, #0xFF00FF00FF00FF00 + mov x2, #0x00FF00FF00FF00FF + and x1, x1, x0 + and x2, x2, x0 + ror x1, x1, #24 + ror x2, x2, #8 + orr x0, x1, x2 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_clz.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_clz.S new file mode 100644 index 00000000000..f77eb03412f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_clz.S @@ -0,0 +1,25 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero bits in a single word +// Input a; output function return +// +// extern uint64_t word_clz (uint64_t a); +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_clz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_clz) + .text + .balign 4 + +S2N_BN_SYMBOL(word_clz): + clz x0, x0 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_ctz.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_ctz.S new file mode 100644 index 00000000000..2f7bcade862 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_ctz.S @@ -0,0 +1,37 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero bits in a single word +// Input a; output function return +// +// extern uint64_t word_ctz (uint64_t a); +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_ctz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_ctz) + .text + .balign 4 + +S2N_BN_SYMBOL(word_ctz): + +// ARM doesn't have a direct word ctz instruction, so we emulate it via +// ctz(w) = 64 - clz(~w & (w-1)). This is depending, for cases of the form +// ctz(....1), on the behavior clz(0) = 64, which is guaranteed according +// to the ARM manual. + + mvn x1, x0 + sub x0, x0, #1 + and x0, x0, x1 + clz x1, x0 + mov x0, #64 + sub x0, x0, x1 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_divstep59.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_divstep59.S new file mode 100644 index 00000000000..a94c70cf150 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_divstep59.S @@ -0,0 +1,323 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Perform 59 "divstep" iterations and return signed matrix of updates +// Inputs d, f, g; output m[2][2] and function return (updated d) +// +// extern int64_t word_divstep59 +// (int64_t m[2][2],int64_t d,uint64_t f,uint64_t g); +// +// Standard ARM ABI: X0 = m, X1 = d, X2 = f, X3 = g, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_divstep59) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_divstep59) + .text + .balign 4 + +#define m x0 +#define d x1 +#define f x2 +#define g x3 + +#define fuv x4 +#define grs x5 +#define t x6 +#define n x7 + +#define m00 x8 +#define m01 x9 +#define m10 x10 +#define m11 x11 + +#define n00 x12 +#define n01 x13 +#define n10 x14 +#define n11 x15 + +S2N_BN_SYMBOL(word_divstep59): + +// Pack f and g into single registers with (negated) update matrices, +// initially the identity matrix. 
The f_lo and g_lo are initially +// the 20 lowest bits of f and g. +// +// fuv = f_lo - 2^41 * 1 - 2^62 * 0 +// grs = g_lo - 2^41 * 0 - 2^62 * 1 + + and fuv, f, #0xFFFFF + orr fuv, fuv, 0xFFFFFE0000000000 + + and grs, g, #0xFFFFF + orr grs, grs, 0xc000000000000000 + + tst grs, #1 + +// Now do 20 divsteps on that packed format. +// +// At the i'th iteration (starting at i = 0, ending at i = 20) +// the intermediate packed values are of the form +// +// fuv = f_lo - 2^{41-i} * m00 - 2^{62-i} * m01 +// grs = g_lo - 2^{41-i} * m10 - 2^{62-i} * m11 +// +// where the following matrix indicates the updates to apply +// to the original (full-sized) f and g for those iterations. +// +// [m00 m01] * [f_0] = [f_i] +// [m10 m11] [g_0] [g_i] + +.set i, 0 +.rep 20 + + csel t, fuv, xzr, ne + ccmp d, xzr, #8, ne + + cneg d, d, ge + cneg t, t, ge + csel fuv, grs, fuv, ge + + add grs, grs, t + add d, d, #2 + +.if (i< 19) + tst grs, #2 +.endif + asr grs, grs, #1 + +.set i, (i+1) +.endr + +// Extract the matrix entries, but keep them in negated form. + + add m00, fuv, #1048576 + sbfx m00, m00, #21, #21 + + mov m11, #1048576 + add m11, m11, m11, lsl #21 + add m01, fuv, m11 + asr m01, m01, #42 + + add m10, grs, #1048576 + sbfx m10, m10, #21, #21 + + add m11, grs, m11 + asr m11, m11, #42 + +// Compute updated f and g using the negated matrix entries; +// this flips the signs of f and g but it doesn't matter. +// +// f = (m00 * f + m01 * g) / 2^20 +// g = (m10 * f + m11 * g) / 2^20 +// +// Since we only need another 40 bits, we can do all of that +// computation naively using (implicitly signed) 64-bit words. + + mul t, m00, f + mul n, m01, g + mul f, m10, f + mul g, m11, g + + add fuv, t, n + add grs, f, g + + asr f, fuv, #20 + asr g, grs, #20 + +// Re-pack for 20 more rounds + + and fuv, f, #0xFFFFF + orr fuv, fuv, 0xFFFFFE0000000000 + + and grs, g, #0xFFFFF + orr grs, grs, 0xc000000000000000 + + tst grs, #1 + +// Second block of 20 divsteps in the same style + +.set i, 0 +.rep 20 + + csel t, fuv, xzr, ne + ccmp d, xzr, #8, ne + + cneg d, d, ge + cneg t, t, ge + csel fuv, grs, fuv, ge + + add grs, grs, t + add d, d, #2 + +.if (i< 19) + tst grs, #2 +.endif + asr grs, grs, #1 + +.set i, (i+1) +.endr + +// Extract the next matrix entries, in negated form again + + add n00, fuv, #1048576 + sbfx n00, n00, #21, #21 + + mov n11, #1048576 + add n11, n11, n11, lsl #21 + add n01, fuv, n11 + asr n01, n01, #42 + + add n10, grs, #1048576 + sbfx n10, n10, #21, #21 + + add n11, grs, n11 + asr n11, n11, #42 + +// Compute updated f and g using the negated matrix entries, +// and so again flipping (thus actually restoring) the signs. +// +// f = (n00 * f + n01 * g) / 2^20 +// g = (n10 * f + n11 * g) / 2^20 + + mul t, n00, f + mul n, n01, g + mul f, n10, f + mul g, n11, g + + add fuv, t, n + add grs, f, g + + asr f, fuv, #20 + asr g, grs, #20 + +// Re-pack for 19 more rounds + + and fuv, f, #0xFFFFF + orr fuv, fuv, 0xFFFFFE0000000000 + + and grs, g, #0xFFFFF + orr grs, grs, 0xc000000000000000 + + tst grs, #1 + +// Split the last divsteps into two blocks of 10 and 9 to insert the matrix +// multiplication in between them. The first ten iterations: + +.set i, 0 +.rep 10 + + csel t, fuv, xzr, ne + ccmp d, xzr, #8, ne + + cneg d, d, ge + cneg t, t, ge + csel fuv, grs, fuv, ge + + add grs, grs, t + add d, d, #2 + + tst grs, #2 + asr grs, grs, #1 + +.set i, (i+1) +.endr + +// Multiply the first two matrices. 
+// +// [m00 m01] = [n00 n01] * [m00 m01] +// [m10 m11] [n10 n11] [m10 m11] +// +// The resulting matrix entries are: +// +// m00' = n00 * m00 + n01 * m10 +// m01' = n00 * m01 + n01 * m11 +// m10' = n10 * m00 + n11 * m10 +// m11' = n10 * m01 + n11 * m11 + + mul f, n00, m00 + mul g, n00, m01 + mul t, n10, m00 + mul n, n10, m01 + + madd m00, n01, m10, f + madd m01, n01, m11, g + madd m10, n11, m10, t + madd m11, n11, m11, n + +// Now the final 9 divsteps + +.rep 9 + + csel t, fuv, xzr, ne + ccmp d, xzr, #8, ne + + cneg d, d, ge + cneg t, t, ge + csel fuv, grs, fuv, ge + + add grs, grs, t + add d, d, #2 + +.if (i< 18) + tst grs, #2 +.endif + asr grs, grs, #1 + +.set i, (i+1) +.endr + +// Extract the matrix entries from the final 19 divsteps + + add n00, fuv, #1048576 + sbfx n00, n00, #22, #21 + + mov n11, #1048576 + add n11, n11, n11, lsl #21 + add n01, fuv, n11 + asr n01, n01, #43 + + add n10, grs, #1048576 + sbfx n10, n10, #22, #21 + + add n11, grs, n11 + asr n11, n11, #43 + +// Multiply by this new matrix +// +// [m00 m01] = [n00 n01] * [m00 m01] +// [m10 m11] [n10 n11] [m10 m11] +// +// The resulting matrix entries are: +// +// m00' = n00 * m00 + n01 * m10 +// m01' = n00 * m01 + n01 * m11 +// m10' = n10 * m00 + n11 * m10 +// m11' = n10 * m01 + n11 * m11 +// +// Since we didn't negate the n matrix, all products are negated +// here using "mneg" and "msub" in place of "mul" and "madd", so +// we have the correct sign for the returned composite matrix. + + mneg f, n00, m00 + mneg g, n00, m01 + mneg fuv, n10, m00 + mneg grs, n10, m01 + + msub m00, n01, m10, f + msub m01, n01, m11, g + msub m10, n11, m10, fuv + msub m11, n11, m11, grs + +// Finally store back and return final d. + + stp m00, m01, [m] + stp m10, m11, [m, #16] + + mov x0, d + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_max.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_max.S new file mode 100644 index 00000000000..2f6a2ea5cac --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_max.S @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return maximum of two unsigned 64-bit words +// Inputs a, b; output function return +// +// extern uint64_t word_max (uint64_t a, uint64_t b); +// +// Standard ARM ABI: X0 = a, X1 = b, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_max) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_max) + .text + .balign 4 + +#define a x0 +#define b x1 + +S2N_BN_SYMBOL(word_max): + + cmp a, b + csel x0, a, b, cs + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_min.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_min.S new file mode 100644 index 00000000000..774538a6b79 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_min.S @@ -0,0 +1,30 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return minimum of two unsigned 64-bit words +// Inputs a, b; output function return +// +// extern uint64_t word_min (uint64_t a, uint64_t b); +// +// Standard ARM ABI: X0 = a, X1 = b, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_min) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_min) + .text + .balign 4 + +#define a x0 +#define b x1 + +S2N_BN_SYMBOL(word_min): + + cmp a, b + csel x0, a, b, cc + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_negmodinv.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_negmodinv.S new file mode 100644 index 00000000000..f44b3e885cb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_negmodinv.S @@ -0,0 +1,78 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Single-word negated modular inverse (-1/a) mod 2^64 +// Input a; output function return +// +// extern uint64_t word_negmodinv (uint64_t a); +// +// A 64-bit function that returns a negated multiplicative inverse mod 2^64 +// of its input, assuming that input is odd. Given odd input a, the result z +// will satisfy a * z + 1 == 0 (mod 2^64), i.e. a 64-bit word multiplication +// a * z will give -1. +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_negmodinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_negmodinv) + .text + .balign 4 + +// Use some more intuitive variable names but these in general are aliased +// to each other so need care when interpreting. Overall we only use the +// registers x0, x1 and x2. +// +// There does seem a slight efficiency advantage in putting e' = e^2 +// before the x' = x (1 + e) each time. That's the only reason for not +// reversing those and hence being able to alias all the e values to the +// same register. + +#define a x0 +#define x x1 +#define one x2 + +#define e1 x2 +#define e2 x0 +#define e4 x2 +#define e8 x0 + +S2N_BN_SYMBOL(word_negmodinv): + +// Initial magical 5-bit approximation x = (a - a<<2) xor 2 + + lsl x, a, #2 + sub x, a, x + eor x, x, #2 + +// Get error e = a * x + 1 for subsequent correction steps + + mov one, #1 + madd e1, a, x, one + +// e2 = e^2, x' = x (1 + e) is good to 10 bits + + mul e2, e1, e1 + madd x, e1, x, x + +// e4 = e^4, x' = x (1 + e^2) is good to 20 bits + + mul e4, e2, e2 + madd x, e2, x, x + +// e8 = e^8, x' = x (1 + e^4) is good to 40 bits + + mul e8, e4, e4 + madd x, e4, x, x + +// Final x' = x (1 + e^8) is good to the 64-bit word size + + madd x0, e8, x, x + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_popcount.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_popcount.S new file mode 100644 index 00000000000..04d5a3b2957 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_popcount.S @@ -0,0 +1,41 @@ +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count number of set bits in a single 64-bit word (population count) +// Input a; output function return +// +// extern uint64_t word_popcount (uint64_t a); +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_popcount) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_popcount) + .text + .balign 4 + +// Very similar to the traditional algorithm, e.g. Hacker's Delight 5-2 + +S2N_BN_SYMBOL(word_popcount): + + and x1, x0, #0xAAAAAAAAAAAAAAAA + sub x0, x0, x1, lsr #1 + + bic x1, x0, #0x3333333333333333 + and x0, x0, #0x3333333333333333 + add x0, x0, x1, lsr #2 + + add x0, x0, x0, lsr #4 + and x0, x0, #0x0F0F0F0F0F0F0F0F + + mov x1, #0x101010101010101 + mul x0, x0, x1 + lsr x0, x0, #56 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_recip.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_recip.S new file mode 100644 index 00000000000..f4fc72056d4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/generic/word_recip.S @@ -0,0 +1,119 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Single-word reciprocal, underestimate of floor(2^128 / a) - 2^64 +// Input a; output function return +// +// extern uint64_t word_recip (uint64_t a); +// +// Given an input word "a" with its top bit set (i.e. 2^63 <= a < 2^64), the +// result "x" is implicitly augmented with a leading 1 giving x' = 2^64 + x. +// The result is x' = ceil(2^128 / a) - 1, which except for the single +// special case a = 2^63 is the same thing as x' = floor(2^128 / a). +// +// Standard ARM ABI: X0 = a, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_recip) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_recip) + .text + .balign 4 + +#define a x0 +#define x x1 + +// Some of these are aliased for clarity + +#define b x2 + +#define t x3 +#define l x3 + +#define d x4 +#define h x4 + +S2N_BN_SYMBOL(word_recip): + +// Scale the input down: b overestimates a/2^16 with b <= 2^48 and +// x underestimates 2^64/b with b * x =~= 2^64, accurate to ~2 bits. + + lsr b, a, #16 + eor x, b, #0x1FFFFFFFFFFFF + add b, b, #1 + lsr x, x, #32 + +// Suppose x = 2^64/b * (1 - e). and get scaled error d = 2^64 * e + + msub d, b, x, xzr + +// Rescale to give c = 2^15 * e (so c <= 2^13) and compute +// e + e^2 + e^3 + e^4 = (1 + e^2) (e + e^2) +// = (2^30 + c^2) * (2^15 * c + c^2) / 2^60 +// and then x * (1 + e + e^2 + e^3 + e^4) +// = (2^30 * x + x * (2^30 + c^2) * (2^30 * c + c^2) / 2^30) / 2^30 + + lsr t, d, #49 + mul t, t, t + lsr d, d, #34 + add d, t, d + orr t, t, #0x40000000 + mul t, d, t + lsr t, t, #30 + lsl d, x, #30 + madd x, x, t, d + lsr x, x, #30 + +// Now b * x =~= 2^64, accurate to ~10 bits. +// Do a 64-bit Newton step, scaling up x by 16 bits in the process. + + msub d, b, x, xzr + lsr d, d, #24 + mul d, d, x + lsl x, x, #16 + lsr d, d, #24 + add x, x, d + +// Now b * x =~= 2^80, accurate to ~20 bits. 
+// Do a 64-bit Newton step, scaling up x by 31 bits in the process + + msub d, b, x, xzr + lsr d, d, #32 + mul d, d, x + lsl x, x, #31 + lsr d, d, #17 + add x, x, d + +// Now a * x =~= 2^127, accurate to ~40 bits. Do a Newton step at full size. +// Instead of literally negating the product (h,l) we complement bits in +// the extracted bitfield, which is close enough and a bit faster. +// At the end we also shift x one more bit left, losing the known-1 top bit +// so that a * (2^64 + x) =~= 2^128. + + mul l, a, x + umulh h, a, x + extr l, h, l, #60 + lsr h, x, #33 + mvn l, l + mul l, h, l + lsl x, x, #1 + lsr l, l, #33 + add x, x, l + +// Test if (x' + 1) * a < 2^128 where x' = 2^64 + x, catching the special +// case where x + 1 would wrap, corresponding to input a = 2^63. + + adds t, x, #1 + cinv t, t, eq + umulh h, a, t + adds h, h, a + +// Select either x or x + 1 accordingly as the final answer + + csel x0, x, t, cs + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/Makefile new file mode 100644 index 00000000000..4489fbc1665 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/Makefile @@ -0,0 +1,66 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). 
The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_add_p256.o \ + bignum_bigendian_4.o \ + bignum_cmul_p256.o \ + bignum_deamont_p256.o \ + bignum_demont_p256.o \ + bignum_double_p256.o \ + bignum_half_p256.o \ + bignum_inv_p256.o \ + bignum_littleendian_4.o \ + bignum_mod_n256.o \ + bignum_mod_n256_4.o \ + bignum_mod_p256.o \ + bignum_mod_p256_4.o \ + bignum_montinv_p256.o \ + bignum_montmul_p256.o \ + bignum_montmul_p256_alt.o \ + bignum_montsqr_p256.o \ + bignum_montsqr_p256_alt.o \ + bignum_mux_4.o \ + bignum_neg_p256.o \ + bignum_nonzero_4.o \ + bignum_optneg_p256.o \ + bignum_sub_p256.o \ + bignum_tomont_p256.o \ + bignum_triple_p256.o \ + p256_montjadd.o \ + p256_montjadd_alt.o \ + p256_montjdouble.o \ + p256_montjdouble_alt.o \ + p256_montjmixadd.o \ + p256_montjmixadd_alt.o \ + p256_montjscalarmul.o \ + p256_montjscalarmul_alt.o \ + p256_scalarmul.o \ + p256_scalarmul_alt.o \ + p256_scalarmulbase.o \ + p256_scalarmulbase_alt.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct unopt/*.o diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_add_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_add_p256.S new file mode 100644 index 00000000000..e739659cebd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_add_p256.S @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p256) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define d0 x4 +#define d1 x5 +#define d2 x6 +#define d3 x7 +#define n0 x8 +#define n1 x9 +#define n2 x10 +#define n3 x11 + +S2N_BN_SYMBOL(bignum_add_p256): + +// First just add the numbers as [c;d3;d2;d1;d0] + + ldp d0, d1, [x] + ldp n0, n1, [y] + adds d0, d0, n0 + adcs d1, d1, n1 + ldp d2, d3, [x, #16] + ldp n2, n3, [y, #16] + adcs d2, d2, n2 + adcs d3, d3, n3 + adc c, xzr, xzr + +// Now let [c;n3;n2;n1;n0] = [c;d3;d2;d1;d0] - p_256 + + subs n0, d0, #0xffffffffffffffff + mov n1, #0x00000000ffffffff + sbcs n1, d1, n1 + sbcs n2, d2, xzr + mov n3, #0xffffffff00000001 + sbcs n3, d3, n3 + sbcs c, c, xzr + +// Select result according to whether (x + y) - p_256 < 0 + + csel d0, d0, n0, cc + csel d1, d1, n1, cc + csel d2, d2, n2, cc + csel d3, d3, n3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_bigendian_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_bigendian_4.S new file mode 100644 index 00000000000..c19c799048b --- /dev/null +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_bigendian_4.S @@ -0,0 +1,136 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert 4-digit (256-bit) bignum to/from big-endian form +// Input x[4]; output z[4] +// +// extern void bignum_bigendian_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The same function is given two other prototypes whose names reflect the +// treatment of one or other argument as a byte array rather than word array: +// +// extern void bignum_frombebytes_4 +// (uint64_t z[static 4], uint8_t x[static 32]); +// +// extern void bignum_tobebytes_4 +// (uint8_t z[static 32], uint64_t x[static 4]); +// +// The implementation works by loading in bytes and storing in words (i.e. +// stylistically it is "frombebytes"); in the more common little-endian +// usage of ARM, this is just byte reversal. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bigendian_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bigendian_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_frombebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_frombebytes_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tobebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tobebytes_4) + + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d x2 +#define dshort w2 +#define a x3 +#define c x4 + +// The reads and writes are organized in mirror-image pairs (0-3 and 1-2) +// to allow x and z to point to the same buffer without using more +// intermediate registers. 
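As an editorial aside, the byte/word scheme described above is easy to state in C. The sketch below uses an assumed name and is only a functional reference for the three aliased entry points: each 64-bit limb is assembled from eight big-endian bytes, and limbs are stored least-significant first.

#include <stdint.h>

// Sketch: z[3] (most significant limb) comes from bytes 0..7 of the
// big-endian input, z[0] from bytes 24..31.
static void bignum_frombebytes_4_ref(uint64_t z[4], const uint8_t x[32]) {
  for (int i = 0; i < 4; i++) {
    uint64_t w = 0;
    for (int j = 0; j < 8; j++)
      w = (w << 8) | x[8 * (3 - i) + j];
    z[i] = w;
  }
}

On a little-endian target this amounts to reversing the 32 bytes, which is why the one routine can serve as bignum_bigendian_4, bignum_frombebytes_4 and bignum_tobebytes_4.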
+ +S2N_BN_SYMBOL(bignum_bigendian_4): +S2N_BN_SYMBOL(bignum_frombebytes_4): +S2N_BN_SYMBOL(bignum_tobebytes_4): + +// 0 and 3 words + + ldrb dshort, [x, #7] + extr a, d, xzr, #8 + ldrb dshort, [x, #6] + extr a, d, a, #8 + ldrb dshort, [x, #5] + extr a, d, a, #8 + ldrb dshort, [x, #4] + extr a, d, a, #8 + ldrb dshort, [x, #3] + extr a, d, a, #8 + ldrb dshort, [x, #2] + extr a, d, a, #8 + ldrb dshort, [x, #1] + extr a, d, a, #8 + ldrb dshort, [x] + extr a, d, a, #8 + + ldrb dshort, [x, #31] + extr c, d, xzr, #8 + ldrb dshort, [x, #30] + extr c, d, c, #8 + ldrb dshort, [x, #29] + extr c, d, c, #8 + ldrb dshort, [x, #28] + extr c, d, c, #8 + ldrb dshort, [x, #27] + extr c, d, c, #8 + ldrb dshort, [x, #26] + extr c, d, c, #8 + ldrb dshort, [x, #25] + extr c, d, c, #8 + ldrb dshort, [x, #24] + extr c, d, c, #8 + + str a, [z, #24] + str c, [z] + +// 1 and 2 words + + ldrb dshort, [x, #15] + extr a, d, xzr, #8 + ldrb dshort, [x, #14] + extr a, d, a, #8 + ldrb dshort, [x, #13] + extr a, d, a, #8 + ldrb dshort, [x, #12] + extr a, d, a, #8 + ldrb dshort, [x, #11] + extr a, d, a, #8 + ldrb dshort, [x, #10] + extr a, d, a, #8 + ldrb dshort, [x, #9] + extr a, d, a, #8 + ldrb dshort, [x, #8] + extr a, d, a, #8 + + ldrb dshort, [x, #23] + extr c, d, xzr, #8 + ldrb dshort, [x, #22] + extr c, d, c, #8 + ldrb dshort, [x, #21] + extr c, d, c, #8 + ldrb dshort, [x, #20] + extr c, d, c, #8 + ldrb dshort, [x, #19] + extr c, d, c, #8 + ldrb dshort, [x, #18] + extr c, d, c, #8 + ldrb dshort, [x, #17] + extr c, d, c, #8 + ldrb dshort, [x, #16] + extr c, d, c, #8 + + str a, [z, #16] + str c, [z, #8] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_cmul_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_cmul_p256.S new file mode 100644 index 00000000000..e0c231f7bf7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_cmul_p256.S @@ -0,0 +1,131 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = c, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256_alt) + .text + .balign 4 + +#define z x0 +#define m x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define h x7 +#define c x8 +#define a0 x9 +#define a1 x10 +#define a2 x11 + +// These are the same! 
+#define a3 x12 +#define q x12 + + +S2N_BN_SYMBOL(bignum_cmul_p256): + +S2N_BN_SYMBOL(bignum_cmul_p256_alt): + +// First do the multiply, straightforwardly to [d;d3;d2;d1;d0] + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + mul d0, m, a0 + mul d1, m, a1 + mul d2, m, a2 + mul d3, m, a3 + umulh a0, m, a0 + umulh a1, m, a1 + umulh a2, m, a2 + umulh h, m, a3 + adds d1, d1, a0 + adcs d2, d2, a1 + adcs d3, d3, a2 + adcs h, h, xzr + +// Writing the product as z = 2^256 * h + 2^192 * l + t = 2^192 * hl + t, our +// intended quotient approximation is (hl + hl>>32 + 1)>>64. Note that by +// hypothesis our product is <= (2^64 - 1) * (p_256 - 1), so there is no need +// to max this out to avoid wrapping, unlike in the more general case of +// bignum_mod_p256. + + subs xzr, xzr, xzr // set carry flag for +1 + extr q, h, d3, #32 + adcs xzr, d3, q + lsr q, h, #32 + adcs q, h, q + +// It's easy to see -p_256 <= z - q * p_256 < p_256, so we just need to +// subtract q * p_256 and then correct if that is negative by adding p_256. +// We want z - q * p_256 +// = (z + 2^224 * q) - (2^256 + 2^192 + 2^96 - 1) * q +// +// We just do that computation in 5 words, freely ignoring the carry, +// since we have plenty to make our later decision just based on one bit, +// so one extra word is ample. + +// First do [a2;a1] = 2^32 * q, which we use twice + + lsl a1, q, #32 + lsr a2, q, #32 + +// Add that to hl, hence including the 2^224 * q part + + adds d3, d3, a1 + adc h, h, a2 + +// Now accumulate [a2;a1;a0] = (2^96 - 1) * q. +// Remember q might be zero so we truly need a (short) carry chain here. + + subs a0, xzr, q + sbcs a1, a1, xzr + sbc a2, a2, xzr + +// Hence load remaining digits and do the subtraction + + subs d0, d0, a0 + sbcs d1, d1, a1 + sbcs d2, d2, a2 + sbcs d3, d3, q + sbcs c, h, q + +// Now our top word is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative +// So correct by adding masked p_256 + + adds d0, d0, c + mov h, #0x00000000ffffffff + and h, h, c + adcs d1, d1, h + adcs d2, d2, xzr + mov h, #0xffffffff00000001 + and h, h, c + adc d3, d3, h + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_deamont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_deamont_p256.S new file mode 100644 index 00000000000..783be66f845 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_deamont_p256.S @@ -0,0 +1,115 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_deamont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. 
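The quotient-estimate reduction documented in bignum_cmul_p256 above can be followed more easily in plain C. The sketch below is an editorial reference with assumed names (cmul_p256_ref, P256), using unsigned __int128 for double-word arithmetic; unlike the csel-based assembly it is not constant-time. It performs the 4x1 multiply into five limbs, forms q = (hl + (hl >> 32) + 1) >> 64 from the top two limbs, subtracts q * p_256, and adds p_256 back if the result went negative.

#include <stdint.h>

typedef unsigned __int128 u128;

static const uint64_t P256[4] = {    // p_256, least significant limb first
  0xffffffffffffffffULL, 0x00000000ffffffffULL,
  0x0000000000000000ULL, 0xffffffff00000001ULL
};

static void cmul_p256_ref(uint64_t z[4], uint64_t c, const uint64_t x[4]) {
  uint64_t d[5], qp[5], r[5];
  u128 acc = 0;

  for (int i = 0; i < 4; i++) {                 // d = c * x (five limbs)
    acc += (u128)c * x[i];
    d[i] = (uint64_t)acc;
    acc >>= 64;
  }
  d[4] = (uint64_t)acc;

  // Quotient estimate from the top two limbs; for x < p_256 this sum stays
  // below 2^128, as the comments above argue.
  u128 hl = ((u128)d[4] << 64) | d[3];
  uint64_t q = (uint64_t)((hl + (hl >> 32) + 1) >> 64);

  acc = 0;                                      // qp = q * p_256 (five limbs)
  for (int i = 0; i < 4; i++) {
    acc += (u128)q * P256[i];
    qp[i] = (uint64_t)acc;
    acc >>= 64;
  }
  qp[4] = (uint64_t)acc;

  uint64_t b = 0;                               // r = d - qp, keeping the sign limb
  for (int i = 0; i < 5; i++) {
    uint64_t bi = ((u128)d[i] < (u128)qp[i] + b);
    r[i] = d[i] - qp[i] - b;
    b = bi;
  }

  uint64_t m = r[4];                            // 0 if r >= 0, all ones if r < 0
  u128 carry = 0;
  for (int i = 0; i < 4; i++) {                 // add back p_256 when negative
    u128 t = (u128)r[i] + (P256[i] & m) + carry;
    z[i] = (uint64_t)t;
    carry = t >> 64;
  }
}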
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with t0, t1 and t2. +// It is fine for d4 to be the same register as d0, and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence [d4;..;d1] := [d3;d2;d1;0] + (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adc d4, d0, xzr + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows (with repetitions) + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define u x6 +#define v x7 +#define w x8 + +S2N_BN_SYMBOL(bignum_deamont_p256): + +S2N_BN_SYMBOL(bignum_deamont_p256_alt): + +// Set up an initial window with the input x and an extra leading zero + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Systematically scroll left doing 1-step reductions + + montreds(d0,d3,d2,d1,d0, u,v,w) + + montreds(d1,d0,d3,d2,d1, u,v,w) + + montreds(d2,d1,d0,d3,d2, u,v,w) + + montreds(d3,d2,d1,d0,d3, u,v,w) + +// Now compare end result in [d3;d2;d1;d0] with p_256 = [w; 0; v; -1] + + mov v, #0x00000000ffffffff + mov w, #0xffffffff00000001 + + subs xzr, d0, #-1 + sbcs xzr, d1, v + sbcs xzr, d2, xzr + sbcs xzr, d3, w + +// Convert the condition [d3;d2;d1;d0] >= p_256 into a bitmask +// and do a masked subtraction + + csetm u, cs + + subs d0, d0, u + and v, v, u + sbcs d1, d1, v + sbcs d2, d2, xzr + and w, w, u + sbc d3, d3, w + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_demont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_demont_p256.S new file mode 100644 index 00000000000..36ea7ec2a9a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_demont_p256.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_256 for correctness. If this is not the case, +// use the variant "bignum_deamont_p256" instead. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with t0, t1 and t2. +// It is fine for d4 to be the same register as d0, and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence [d4;..;d1] := [d3;d2;d1;0] + (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adc d4, d0, xzr + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows (with repetitions) + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define u x6 +#define v x7 +#define w x8 + +S2N_BN_SYMBOL(bignum_demont_p256): + +S2N_BN_SYMBOL(bignum_demont_p256_alt): + +// Set up an initial window with the input x and an extra leading zero + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Systematically scroll left doing 1-step reductions + + montreds(d0,d3,d2,d1,d0, u,v,w) + + montreds(d1,d0,d3,d2,d1, u,v,w) + + montreds(d2,d1,d0,d3,d2, u,v,w) + + montreds(d3,d2,d1,d0,d3, u,v,w) + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_double_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_double_p256.S new file mode 100644 index 00000000000..1d7e3460d08 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_double_p256.S @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
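Both bignum_deamont_p256 and bignum_demont_p256 are built from the montreds step described above, which relies on p_256 being congruent to -1 modulo 2^64: the word-level Montgomery multiplier is simply the low limb itself. A plain-C sketch of one step and of the four-step chain (assumed names, unsigned __int128 carries, functional reference only):

#include <stdint.h>

typedef unsigned __int128 u128;

static const uint64_t P256[4] = {    // p_256, least significant limb first
  0xffffffffffffffffULL, 0x00000000ffffffffULL,
  0x0000000000000000ULL, 0xffffffff00000001ULL
};

// One montreds-style step: z = (x + x[0] * p_256) / 2^64. Adding x[0] * p_256
// zeroes the bottom limb because p_256 == -1 (mod 2^64), and the quotient
// still fits in four limbs.
static void montreds_p256_ref(uint64_t z[4], const uint64_t x[4]) {
  uint64_t w = x[0];
  u128 c = ((u128)x[0] + (u128)w * P256[0]) >> 64;   // low limb becomes zero
  for (int i = 1; i < 4; i++) {
    u128 t = (u128)x[i] + (u128)w * P256[i] + c;
    z[i - 1] = (uint64_t)t;
    c = t >> 64;
  }
  z[3] = (uint64_t)c;
}

// Four chained steps give (x / 2^256) mod p_256 for x already below p_256,
// which is what bignum_demont_p256 computes; bignum_deamont_p256 adds the
// final compare-and-subtract so that any 4-limb input comes out fully reduced.
static void demont_p256_ref(uint64_t z[4], const uint64_t x[4]) {
  montreds_p256_ref(z, x);
  montreds_p256_ref(z, z);
  montreds_p256_ref(z, z);
  montreds_p256_ref(z, z);
}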
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p256) + .text + .balign 4 + +#define z x0 +#define x x1 +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define c x6 +#define n0 x7 +#define n1 x8 +#define n2 x9 +#define n3 x10 + + +S2N_BN_SYMBOL(bignum_double_p256): + +// Double the input number as 2 * x = c + [d3; d2; d1; d0] +// It's worth considering doing this with extr...63 instead + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + adds d0, d0, d0 + adcs d1, d1, d1 + adcs d2, d2, d2 + adcs d3, d3, d3 + adc c, xzr, xzr + +// Subtract p_256 to give 2 * x - p_256 = c + [n3; n2; n1; n0] + + subs n0, d0, #0xffffffffffffffff + mov n1, #0x00000000ffffffff + sbcs n1, d1, n1 + sbcs n2, d2, xzr + mov n3, #0xffffffff00000001 + sbcs n3, d3, n3 + sbcs c, c, xzr + +// Now CF is set (because of inversion) if 2 * x >= p_256, in which case the +// correct result is [n3; n2; n1; n0], otherwise [d3; d2; d1; d0] + + csel d0, d0, n0, cc + csel d1, d1, n1, cc + csel d2, d2, n2, cc + csel d3, d3, n3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_half_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_half_p256.S new file mode 100644 index 00000000000..e2612ac27cf --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_half_p256.S @@ -0,0 +1,71 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
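bignum_double_p256 above follows the same shape as bignum_add_p256 earlier in this change: widen the sum by one carry word, subtract p_256, and select the pre- or post-subtraction value based on the borrow. The C sketch below captures that shared pattern under an assumed name; it is a functional reference only and, unlike the csel-based assembly, makes no constant-time claim.

#include <stdint.h>

typedef unsigned __int128 u128;

static const uint64_t P256[4] = {    // p_256, least significant limb first
  0xffffffffffffffffULL, 0x00000000ffffffffULL,
  0x0000000000000000ULL, 0xffffffff00000001ULL
};

// z = (x + y) mod p_256, assuming x, y < p_256; doubling is the y == x case.
static void add_p256_ref(uint64_t z[4], const uint64_t x[4], const uint64_t y[4]) {
  uint64_t d[4], n[4], c = 0, b = 0;
  for (int i = 0; i < 4; i++) {                 // [c;d] = x + y
    u128 t = (u128)x[i] + y[i] + c;
    d[i] = (uint64_t)t;
    c = (uint64_t)(t >> 64);
  }
  for (int i = 0; i < 4; i++) {                 // n = d - p_256, tracking the borrow
    uint64_t bi = ((u128)d[i] < (u128)P256[i] + b);
    n[i] = d[i] - P256[i] - b;
    b = bi;
  }
  for (int i = 0; i < 4; i++)                   // keep the raw sum only if it was < p_256
    z[i] = (c < b) ? d[i] : n[i];
}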
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_p256) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define d4 x6 +#define m x7 +#define n x8 + +S2N_BN_SYMBOL(bignum_half_p256): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Get a bitmask corresponding to the lowest bit of the input + + and m, d0, #1 + neg m, m + +// Do a masked addition of p_256, catching carry in a 5th word + + adds d0, d0, m + and n, m, #0x00000000ffffffff + adcs d1, d1, n + adcs d2, d2, xzr + and n, m, #0xffffffff00000001 + adcs d3, d3, n + adc d4, xzr, xzr + +// Now shift that sum right one place + + extr d0, d1, d0, #1 + extr d1, d2, d1, #1 + extr d2, d3, d2, #1 + extr d3, d4, d3, #1 + +// Store back + + stp d0, d1, [z] + stp d2, d3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_inv_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_inv_p256.S new file mode 100644 index 00000000000..489a9d5f6b2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_inv_p256.S @@ -0,0 +1,1274 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_inv_p256(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_256, i.e. is not divisible +// by it, returns z < p_256 such that x * z == 1 (mod p_256). Note that +// x does not need to be reduced modulo p_256, but the output always is. +// If the input is divisible (i.e. is 0 or p_256), then there can be no +// modular inverse and z = 0 is returned. 
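The halving routine above never divides: when x is odd it adds p_256 (which is odd, so the five-limb sum becomes even) and then shifts the whole value right by one bit. A C sketch of that trick, with an assumed name and the p_256 limbs repeated for self-containment:

#include <stdint.h>

typedef unsigned __int128 u128;

static const uint64_t P256[4] = {    // p_256, least significant limb first
  0xffffffffffffffffULL, 0x00000000ffffffffULL,
  0x0000000000000000ULL, 0xffffffff00000001ULL
};

// z = (x / 2) mod p_256 for x < p_256.
static void half_p256_ref(uint64_t z[4], const uint64_t x[4]) {
  uint64_t m = (uint64_t)0 - (x[0] & 1);        // all-ones mask when x is odd
  uint64_t d[5];
  u128 c = 0;
  for (int i = 0; i < 4; i++) {                 // d = x + (p_256 masked by m)
    u128 t = (u128)x[i] + (P256[i] & m) + c;
    d[i] = (uint64_t)t;
    c = t >> 64;
  }
  d[4] = (uint64_t)c;
  for (int i = 0; i < 4; i++)                   // shift the 5-limb sum right one bit
    z[i] = (d[i] >> 1) | (d[i + 1] << 63);
}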
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p256) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(6*N) +#define u sp, #(12*N) +#define v sp, #(16*N) + +// Total size to reserve on the stack + +#define NSPACE #(20*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally +// as well as t0, t1, t2. This is almost-Montgomery, i.e. the result fits +// in 4 digits but is not necessarily strictly reduced mod p_256. +// --------------------------------------------------------------------------- + +#define amontred(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_256. */ \ + mov t0, #0xe000000000000000 __LF \ + adds d0, d0, t0 __LF \ + sbcs d1, d1, xzr __LF \ + mov t1, #0x000000001fffffff __LF \ + adcs d2, d2, t1 __LF \ + mov t2, #0x2000000000000000 __LF \ + adcs d3, d3, t2 __LF \ + mov t0, #0x1fffffffe0000000 __LF \ + adc d4, d4, t0 __LF \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence basic [d4;d3;d2;d1] += (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adcs d4, d4, d0 __LF \ +/* Now capture top carry and subtract p_256 if set (almost-Montgomery) */ \ + mov t0, #0xffffffffffffffff __LF \ + mov t1, #0x00000000ffffffff __LF \ + mov t2, #0xffffffff00000001 __LF \ + csel t0, t0, xzr, cs __LF \ + csel t1, t1, xzr, cs __LF \ + csel t2, t2, xzr, cs __LF \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, xzr __LF \ + sbc d4, d4, t2 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, 
x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr 
x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, 
x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, 
#0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_inv_p256): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0xffffffffffffffff + mov x11, #0x00000000ffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [f] + stp xzr, x13, [f+2*N] + str xzr, [f+4*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + str xzr, [g+4*N] + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-50} * [u,v] (mod p_256) +// starting with [p_256,x] == x * 2^{5*0-50} * [0,2^50] (mod p_256) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + + mov x10, #0x0004000000000000 + stp x10, xzr, [v] + stp xzr, xzr, [v+2*N] + +// Start of main loop. 
We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b bignum_inv_p256_midloop + +bignum_inv_p256_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. +// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + ldr x23, [f+4*N] + eor x3, x23, s00 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + ldr x24, [g+4*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + asr x3, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + eor x5, x23, s10 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + asr x5, x5, #59 + str x5, [g+4*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. 
A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldr x6, [u+2*N] + amontred(x3,x5,x6,x1,x0, x10,x11,x14) + stp x1, x6, [u] + stp x5, x3, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldr x3, [v+2*N] + amontred(x5,x2,x3,x1,x0, x10,x11,x14) + stp x1, x3, [v] + stp x2, x5, [v+16] + +bignum_inv_p256_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_inv_p256_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_256) +// we want to flip the sign of u according to that of f. 
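A small aside on the complement-instead-of-negation adjustment used throughout these accumulations: with s the sign mask of a matrix entry m (all ones when m is negative) and a = |m| its magnitude, m * x == a * (x ^ s) + (a & s) modulo 2^64, because -x == ~x + 1; the "(a & s)" corrections are what gets folded into the initial carries car0 and car1. A one-word C sketch of the identity (assumed name, avoiding any reliance on signed right shifts):

#include <stdint.h>

// Demonstrates m * x == |m| * (x ^ s) + (|m| & s) (mod 2^64),
// where s is all ones when m < 0 and zero otherwise.
static uint64_t signed_mul_via_complement(int64_t m, uint64_t x) {
  uint64_t s = (m < 0) ? ~(uint64_t)0 : 0;     // sign mask
  uint64_t a = ((uint64_t)m ^ s) - s;          // magnitude |m|
  return a * (x ^ s) + (a & s);                // equals (uint64_t)m * x
}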
+ + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u. This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_256 + + ldp x0, x1, [u] + ldr x2, [u+2*N] + amontred(x3,x5,x2,x1,x0, x10,x11,x14) + + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0x00000000ffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + +// Store it back to the final output + + stp x10, x11, [res] + stp x12, x13, [res, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_littleendian_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_littleendian_4.S new file mode 100644 index 00000000000..84a4aec994d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_littleendian_4.S @@ -0,0 +1,133 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert 4-digit (256-bit) bignum to/from little-endian form +// Input x[4]; output z[4] +// +// extern void bignum_littleendian_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The same function is given two other prototypes whose names reflect the +// treatment of one or other argument as a byte array rather than word array: +// +// extern void bignum_fromlebytes_4 +// (uint64_t z[static 4], uint8_t x[static 32]); +// +// extern void bignum_tolebytes_4 +// (uint8_t z[static 32], uint64_t x[static 4]); +// +// The implementation works by loading in bytes and storing in words (i.e. 
+// stylistically it is "fromlebytes"); in the more common little-endian +// usage of ARM, this is just copying. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_littleendian_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_littleendian_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_fromlebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_fromlebytes_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tolebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tolebytes_4) + + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d x2 +#define dshort w2 +#define a x3 + +S2N_BN_SYMBOL(bignum_littleendian_4): +S2N_BN_SYMBOL(bignum_fromlebytes_4): +S2N_BN_SYMBOL(bignum_tolebytes_4): + +// word 0 + + ldrb dshort, [x] + extr a, d, xzr, #8 + ldrb dshort, [x, #1] + extr a, d, a, #8 + ldrb dshort, [x, #2] + extr a, d, a, #8 + ldrb dshort, [x, #3] + extr a, d, a, #8 + ldrb dshort, [x, #4] + extr a, d, a, #8 + ldrb dshort, [x, #5] + extr a, d, a, #8 + ldrb dshort, [x, #6] + extr a, d, a, #8 + ldrb dshort, [x, #7] + extr a, d, a, #8 + str a, [z] + +// word 1 + + ldrb dshort, [x, #8] + extr a, d, xzr, #8 + ldrb dshort, [x, #9] + extr a, d, a, #8 + ldrb dshort, [x, #10] + extr a, d, a, #8 + ldrb dshort, [x, #11] + extr a, d, a, #8 + ldrb dshort, [x, #12] + extr a, d, a, #8 + ldrb dshort, [x, #13] + extr a, d, a, #8 + ldrb dshort, [x, #14] + extr a, d, a, #8 + ldrb dshort, [x, #15] + extr a, d, a, #8 + str a, [z, #8] + +// word 2 + + ldrb dshort, [x, #16] + extr a, d, xzr, #8 + ldrb dshort, [x, #17] + extr a, d, a, #8 + ldrb dshort, [x, #18] + extr a, d, a, #8 + ldrb dshort, [x, #19] + extr a, d, a, #8 + ldrb dshort, [x, #20] + extr a, d, a, #8 + ldrb dshort, [x, #21] + extr a, d, a, #8 + ldrb dshort, [x, #22] + extr a, d, a, #8 + ldrb dshort, [x, #23] + extr a, d, a, #8 + str a, [z, #16] + +// word 3 + + ldrb dshort, [x, #24] + extr a, d, xzr, #8 + ldrb dshort, [x, #25] + extr a, d, a, #8 + ldrb dshort, [x, #26] + extr a, d, a, #8 + ldrb dshort, [x, #27] + extr a, d, a, #8 + ldrb dshort, [x, #28] + extr a, d, a, #8 + ldrb dshort, [x, #29] + extr a, d, a, #8 + ldrb dshort, [x, #30] + extr a, d, a, #8 + ldrb dshort, [x, #31] + extr a, d, a, #8 + str a, [z, #24] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256.S new file mode 100644 index 00000000000..d99740d288f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256.S @@ -0,0 +1,175 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_n256 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the NIST curve P-256. 
+// +// Standard ARM ABI: X0 = z, X1 = k, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256_alt) + .text + .balign 4 + +#define z x0 +#define k x1 +#define x x2 + +#define m0 x3 +#define m1 x4 +#define m2 x5 +#define m3 x6 + +#define t0 x7 +#define t1 x8 +#define t2 x9 +#define t3 x10 +#define t4 x11 + +#define n0 x12 +#define n1 x13 +#define n3 x14 + +// These two are aliased: we only load d when finished with q + +#define q x15 +#define d x15 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_n256): + +S2N_BN_SYMBOL(bignum_mod_n256_alt): + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmp k, #4 + bcc bignum_mod_n256_short + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + sub k, k, #4 + lsl t0, k, #3 + add t0, t0, x + ldp m2, m3, [t0, #16] + ldp m0, m1, [t0] + +// Load the complicated three words of 2^256 - n_256 = [n3; 0; n1; n0] + + movbig( n0, #0x0c46, #0x353d, #0x039c, #0xdaaf) + movbig( n1, #0x4319, #0x0552, #0x58e8, #0x617b) + mov n3, #0x00000000ffffffff + +// Reduce the top 4 digits mod n_256 (a conditional subtraction of n_256) + + adds t0, m0, n0 + adcs t1, m1, n1 + adcs t2, m2, xzr + adcs t3, m3, n3 + csel m0, m0, t0, cc + csel m1, m1, t1, cc + csel m2, m2, t2, cc + csel m3, m3, t3, cc + +// Now do (k-4) iterations of 5->4 word modular reduction + + cbz k, bignum_mod_n256_writeback +bignum_mod_n256_loop: + +// Writing the input as z = 2^256 * m3 + 2^192 * m2 + t = 2^192 * h + t, our +// intended quotient approximation is MIN ((h + h>>32 + 1)>>64) (2^64 - 1). + + subs xzr, xzr, xzr // set carry flag for +1 + extr q, m3, m2, #32 + adcs xzr, m2, q + lsr q, m3, #32 + adcs q, m3, q + csetm t0, cs + orr q, q, t0 + +// [t4;t3;t2;t1;t0] = q * (2^256 - n_256) + + mul t0, n0, q + mul t1, n1, q + mul t3, n3, q + umulh t2, n0, q + adds t1, t1, t2 + umulh t2, n1, q + adc t2, t2, xzr // No carry: high of mul + {0,1} + umulh t4, n3, q + +// Compensate for 2^256 * q + + sub m3, m3, q + +// Decrement k and load the next digit (note that d aliases to q) + + sub k, k, #1 + ldr d, [x, k, lsl #3] + +// [t4;t3;t2;t1;t0] = [m3;m2;m1;m0;d] - q * n_256 + + adds t0, d, t0 + adcs t1, m0, t1 + adcs t2, m1, t2 + adcs t3, m2, t3 + adc t4, m3, t4 + +// Now our top word t4 is either zero or all 1s. 
Use it for a masked +// addition of n_256, which we can do by a *subtraction* of +// 2^256 - n_256 from our portion, re-using the constants + + and d, t4, n0 + subs m0, t0, d + and d, t4, n1 + sbcs m1, t1, d + sbcs m2, t2, xzr + and d, t4, n3 + sbc m3, t3, d + + cbnz k, bignum_mod_n256_loop + +// Finally write back [m3;m2;m1;m0] and return + +bignum_mod_n256_writeback: + stp m0, m1, [z] + stp m2, m3, [z, #16] + ret + +// Short case: just copy the input with zero-padding + +bignum_mod_n256_short: + mov m0, xzr + mov m1, xzr + mov m2, xzr + mov m3, xzr + + cbz k, bignum_mod_n256_writeback + ldr m0, [x] + subs k, k, #1 + beq bignum_mod_n256_writeback + ldr m1, [x, #8] + subs k, k, #1 + beq bignum_mod_n256_writeback + ldr m2, [x, #16] + b bignum_mod_n256_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256_4.S new file mode 100644 index 00000000000..4ea3c905347 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_n256_4.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n256_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the NIST curve P-256. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_n256_4): + +// Load the complicated three words of n_256, the other being all 1s + + movbig( n0, #0xf3b9, #0xcac2, #0xfc63, #0x2551) + movbig( n1, #0xbce6, #0xfaad, #0xa717, #0x9e84) + mov n3, #0xffffffff00000000 + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. Since word 2 of n_256 is all 1s, that can be +// done by adding zero with carry, thanks to the inverted carry. + + subs n0, d0, n0 + sbcs n1, d1, n1 + adcs n2, d2, xzr + sbcs n3, d3, n3 + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256.S new file mode 100644 index 00000000000..b53ee5b7791 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256.S @@ -0,0 +1,162 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_p256 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = z, X1 = k, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256_alt) + .text + .balign 4 + +#define z x0 +#define k x1 +#define x x2 + +#define m0 x3 +#define m1 x4 +#define m2 x5 +#define m3 x6 + +#define t0 x7 +#define t1 x8 +#define t2 x9 +#define t3 x10 +#define t4 x11 + +#define n1 x12 +#define n3 x13 + +#define q x14 + + +S2N_BN_SYMBOL(bignum_mod_p256): + +S2N_BN_SYMBOL(bignum_mod_p256_alt): + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmp k, #4 + bcc bignum_mod_p256_short + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + sub k, k, #4 + lsl t0, k, #3 + add t0, t0, x + ldp m2, m3, [t0, #16] + ldp m0, m1, [t0] + +// Load the complicated words of p_256 = [n3;0;n1;-1] + + mov n1, #0x00000000ffffffff + mov n3, #0xffffffff00000001 + +// Reduce the top 4 digits mod p_256 (a conditional subtraction of p_256) + + subs t0, m0, #-1 + sbcs t1, m1, n1 + sbcs t2, m2, xzr + sbcs t3, m3, n3 + + csel m0, m0, t0, cc + csel m1, m1, t1, cc + csel m2, m2, t2, cc + csel m3, m3, t3, cc + +// Now do (k-4) iterations of 5->4 word modular reduction + + cbz k, bignum_mod_p256_writeback +bignum_mod_p256_loop: + +// Decrement k and load the next digit as t0. We then want to reduce +// [m3;m2;m1;m0;t0] |-> [m3;m2;m1;m0]; the shuffling downwards is absorbed +// into the various ALU operations + + sub k, k, #1 + ldr t0, [x, k, lsl #3] + +// Writing the input as z = 2^256 * h + 2^192 * l + t = 2^192 * hl + t, our +// intended quotient approximation is MIN ((hl + hl>>32 + 1)>>64) (2^64 - 1). 
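+//
+// (Purely illustrative restatement in Python-style arithmetic, where hl is
+// the 128-bit value [m3;m2]:
+//
+//     q = min((hl + (hl >> 32) + 1) >> 64, 2**64 - 1)
+//
+// which is what the saturating sequence below computes.)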
+ + subs xzr, xzr, xzr // set carry flag for +1 + extr q, m3, m2, #32 + adcs xzr, m2, q + lsr q, m3, #32 + adcs q, m3, q + csetm t1, cs + orr q, q, t1 + +// First do [t2;t1] = 2^32 * q, which we use twice + + lsl t1, q, #32 + lsr t2, q, #32 + +// Add 2^224 * q to sum + + adds t3, m2, t1 + adc t4, m3, t2 + +// Accumulate [t2;t1;m3] = (2^96 - 1) * q, using m3 briefly as a temporary + + subs m3, xzr, q + sbcs t1, t1, xzr + sbc t2, t2, xzr + +// Subtract (2^256 + 2^192 + 2^96 - 1) * q + + subs t0, t0, m3 + sbcs t1, m0, t1 + sbcs t2, m1, t2 + sbcs t3, t3, q + sbc t4, t4, q + +// Use top word as mask to correct + + adds m0, t0, t4 + and t0, n1, t4 + adcs m1, t1, t0 + adcs m2, t2, xzr + and t0, n3, t4 + adc m3, t3, t0 + + cbnz k, bignum_mod_p256_loop + +// Finally write back [m3;m2;m1;m0] and return + +bignum_mod_p256_writeback: + stp m0, m1, [z] + stp m2, m3, [z, #16] + ret + +// Short case: just copy the input with zero-padding + +bignum_mod_p256_short: + mov m0, xzr + mov m1, xzr + mov m2, xzr + mov m3, xzr + + cbz k, bignum_mod_p256_writeback + ldr m0, [x] + subs k, k, #1 + beq bignum_mod_p256_writeback + ldr m1, [x, #8] + subs k, k, #1 + beq bignum_mod_p256_writeback + ldr m2, [x, #16] + b bignum_mod_p256_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256_4.S new file mode 100644 index 00000000000..0a910eccfe5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mod_p256_4.S @@ -0,0 +1,71 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p256_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + + +S2N_BN_SYMBOL(bignum_mod_p256_4): + +// Load the three nonzero words of p_256 = [n3;0;n2;n1] + + mov n0, #0xffffffffffffffff + mov n1, #0x00000000ffffffff + mov n3, #0xffffffff00000001 + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. 
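+// (Illustrative sketch, not part of the code: since x < 2^256 < 2*p_256, a
+// single conditional subtraction is a full reduction. In Python terms:
+//
+//     p_256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//     z = x - p_256 if x >= p_256 else x
+//
+// The subs/sbcs chain below computes x - p_256 and the csel instructions
+// restore x if the subtraction borrowed.)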
+ + subs n0, d0, n0 + sbcs n1, d1, n1 + sbcs n2, d2, xzr + sbcs n3, d3, n3 + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montinv_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montinv_p256.S new file mode 100644 index 00000000000..46f71514aa4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montinv_p256.S @@ -0,0 +1,1303 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_montinv_p256(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_256, i.e. is not divisible +// by it, returns z < p_256 such that x * z == 2^512 (mod p_256). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^256 * X and z == 2^256 * Z +// (both mod p_256) then X * Z == 1 (mod p_256). That is, this function +// gives the analog of the modular inverse bignum_inv_p256 but with both +// input and output in the Montgomery domain. Note that x does not need +// to be reduced modulo p_256, but the output always is. If the input +// is divisible (i.e. is 0 or p_256), then there can be no solution to +// the congruence x * z == 2^512 (mod p_256), and z = 0 is returned. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p256) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(6*N) +#define u sp, #(12*N) +#define v sp, #(16*N) + +// Total size to reserve on the stack + +#define NSPACE #(20*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. 
Takes input in +// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally +// as well as t0, t1, t2. This is almost-Montgomery, i.e. the result fits +// in 4 digits but is not necessarily strictly reduced mod p_256. +// --------------------------------------------------------------------------- + +#define amontred(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_256. */ \ + mov t0, #0xe000000000000000 __LF \ + adds d0, d0, t0 __LF \ + sbcs d1, d1, xzr __LF \ + mov t1, #0x000000001fffffff __LF \ + adcs d2, d2, t1 __LF \ + mov t2, #0x2000000000000000 __LF \ + adcs d3, d3, t2 __LF \ + mov t0, #0x1fffffffe0000000 __LF \ + adc d4, d4, t0 __LF \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence basic [d4;d3;d2;d1] += (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adcs d4, d4, d0 __LF \ +/* Now capture top carry and subtract p_256 if set (almost-Montgomery) */ \ + mov t0, #0xffffffffffffffff __LF \ + mov t1, #0x00000000ffffffff __LF \ + mov t2, #0xffffffff00000001 __LF \ + csel t0, t0, xzr, cs __LF \ + csel t1, t1, xzr, cs __LF \ + csel t2, t2, xzr, cs __LF \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, xzr __LF \ + sbc d4, d4, t2 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, 
x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + 
mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF 
\ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst 
x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + 
sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_montinv_p256): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0xffffffffffffffff + mov x11, #0x00000000ffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [f] + stp xzr, x13, [f+2*N] + str xzr, [f+4*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + str xzr, [g+4*N] + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-562} * [u,v] (mod p_256) +// starting with [p_256,x] == x * 2^{5*0-562} * [0,2^562] (mod p_256) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 10th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{50-562} * u == 1, i.e. +// x * u == 2^512 as required. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + +// The starting constant 2^562 mod p_256 is +// 0x000bffffffebffff:fffbffffffefffff:ffe8000000000000:000c000000140000 +// where colons separate 64-bit subwords, least significant at the right. +// Only word number 1, value 0xffe8000000000000, is a single ARM move. + + mov x10, #0x0000000000140000 + orr x10, x10, #0x000c000000000000 + + mov x11, #0xffe8000000000000 + + movbig(x13, #0x000b, #0xffff, #0xffef, #0xffff) + orr x12, x13, #0xfff0000000000000 + and x13, x13, #0xfffffffffffbffff + + stp x10, x11, [v] + stp x12, x13, [v+2*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b bignum_montinv_p256_midloop + +bignum_montinv_p256_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
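+//
+// (Illustrative sketch, assuming Python-style big integers: with the signed
+// matrix entries returned by divstep59, each iteration below effectively
+// performs
+//
+//     f, g = (m00*f + m01*g) >> 59, (m10*f + m11*g) >> 59
+//
+// using arithmetic shifts; the digit-by-digit code is this accumulation with
+// negation expressed as XOR-with-sign-mask plus the correction added above.)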
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + ldr x23, [f+4*N] + eor x3, x23, s00 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + ldr x24, [g+4*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + asr x3, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + eor x5, x23, s10 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + asr x5, x5, #59 + str x5, [g+4*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. 
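+//
+// (Illustrative sketch of the word-level Montgomery step used here, assuming
+// Python-style big integers: since p_256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+// is congruent to -1 mod 2**64, the Montgomery quotient is just the low word,
+// so one step is
+//
+//     w = t & (2**64 - 1)
+//     t = (t + w * p_256) >> 64        # congruent to t * 2^-64 (mod p_256)
+//
+// which is what amontred implements, up to its "almost" (not fully reduced)
+// output range and the initial offset that makes the signed input positive.)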
+// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldr x6, [u+2*N] + amontred(x3,x5,x6,x1,x0, x10,x11,x14) + stp x1, x6, [u] + stp x5, x3, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldr x3, [v+2*N] + amontred(x5,x2,x3,x1,x0, x10,x11,x14) + stp x1, x3, [v] + stp x2, x5, [v+16] + +bignum_montinv_p256_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_montinv_p256_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-512} [u,v] (mod p_256) +// we want to flip the sign of u according to that of f. 
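+//
+// (Illustrative check of the overall contract, in Python terms: on return,
+// for any input x not divisible by p_256, the stored z should satisfy
+//
+//     (x * z) % p_256 == pow(2, 512, p_256)
+//
+// matching the "Montgomery inverse" specification in the header comment.)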
+ + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u. This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_256 + + ldp x0, x1, [u] + ldr x2, [u+2*N] + amontred(x3,x5,x2,x1,x0, x10,x11,x14) + + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0x00000000ffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + +// Store it back to the final output + + stp x10, x11, [res] + stp x12, x13, [res, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256.S new file mode 100644 index 00000000000..d02aa3c6b62 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256.S @@ -0,0 +1,462 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256 (in particular this is true if we are in +// the "usual" case x < p_256 and y < p_256). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- + +// bignum_montmul_p256 is functionally equivalent to +// unopt/bignum_montmul_p256_base. +// It is written in a way that +// 1. 
A subset of scalar multiplications in bignum_montmul_p256_base are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// ldp x7, x13, [x1] +// ldr q16, [x1] +// ldp x9, x15, [x1, #16] +// ldp x14, x4, [x2] +// ldr q19, [x2] +// ldp x12, x16, [x2, #16] +// ldr q29, [x1, #16] +// ldr q30, [x2, #16] +// uzp1 v17.4S, v19.4S, v16.4S +// rev64 v18.4S, v19.4S +// uzp1 v28.4S, v16.4S, v16.4S +// mul v24.4S, v18.4S, v16.4S +// uaddlp v18.2D, v24.4S +// shl v16.2D, v18.2D, #32 +// umlal v16.2D, v28.2S, v17.2S +// mov x2, v16.d[0] +// mov x1, v16.d[1] +// umulh x5, x7, x14 +// adds x17, x2, x1 +// umulh x3, x13, x4 +// adcs x8, x5, x3 +// adcs x10, x3, xzr +// adds x5, x5, x17 +// adcs x1, x1, x8 +// adcs x8, x10, xzr +// subs x17, x7, x13 +// cneg x3, x17, cc +// csetm x11, cc +// subs x10, x4, x14 +// cneg x6, x10, cc +// mul x17, x3, x6 +// umulh x6, x3, x6 +// cinv x11, x11, cc +// eor x17, x17, x11 +// eor x3, x6, x11 +// cmn x11, #0x1 +// adcs x5, x5, x17 +// adcs x10, x1, x3 +// adc x1, x8, x11 +// lsl x3, x2, #32 +// subs x17, x2, x3 +// lsr x11, x2, #32 +// sbc x8, x2, x11 +// adds x2, x5, x3 +// adcs x6, x10, x11 +// adcs x3, x1, x17 +// adc x10, x8, xzr +// lsl x5, x2, #32 +// subs x17, x2, x5 +// lsr x11, x2, #32 +// sbc x8, x2, x11 +// adds x2, x6, x5 +// adcs x6, x3, x11 +// adcs x1, x10, x17 +// adc x17, x8, xzr +// stp x2, x6, [x0] // @slothy:writes=buffer0 +// stp x1, x17, [x0, #16] // @slothy:writes=buffer16 +// movi v28.2D, #0x00000000ffffffff +// uzp2 v22.4S, v30.4S, v30.4S +// xtn v4.2S, v29.2D +// xtn v27.2S, v30.2D +// rev64 v23.4S, v30.4S +// umull v17.2D, v4.2S, v27.2S +// umull v7.2D, v4.2S, v22.2S +// uzp2 v16.4S, v29.4S, v29.4S +// mul v29.4S, v23.4S, v29.4S +// usra v7.2D, v17.2D, #32 +// umull v30.2D, v16.2S, v22.2S +// uaddlp v20.2D, v29.4S +// and v18.16B, v7.16B, v28.16B +// umlal v18.2D, v16.2S, v27.2S +// shl v16.2D, v20.2D, #32 +// usra v30.2D, v7.2D, #32 +// umlal v16.2D, v4.2S, v27.2S +// usra v30.2D, v18.2D, #32 +// mov x11, v16.d[0] +// mov x5, v16.d[1] +// mov x2, v30.d[0] +// adds x3, x11, x5 +// mov x17, v30.d[1] +// adcs x8, x2, x17 +// adcs x1, x17, xzr +// adds x17, x2, x3 +// adcs x8, x5, x8 +// adcs x1, x1, xzr +// subs x2, x9, x15 +// cneg x6, x2, cc +// csetm x3, cc +// subs x2, x16, x12 +// cneg x5, x2, cc +// mul x10, x6, x5 +// umulh x5, x6, x5 +// cinv x3, x3, cc +// eor x10, x10, x3 +// eor x6, x5, x3 +// cmn x3, #0x1 +// adcs x2, x17, x10 +// adcs x6, x8, x6 +// adc x5, x1, x3 +// subs x7, x9, x7 +// sbcs x3, x15, x13 +// ngc x17, xzr +// cmn x17, #0x1 +// eor x8, x7, x17 +// adcs x13, x8, xzr +// eor x15, x3, x17 +// adcs x1, x15, xzr +// subs x9, x14, x12 +// sbcs x14, x4, x16 +// ngc x3, xzr +// cmn x3, #0x1 +// eor x12, x9, x3 +// adcs x7, x12, xzr +// eor x12, x14, x3 +// adcs x12, x12, xzr +// eor x10, x17, x3 +// ldp x4, x15, [x0] // @slothy:reads=buffer0 +// adds x17, x11, x4 +// adcs x16, x2, x15 +// ldp x3, x15, [x0, #16] // @slothy:reads=buffer16 +// adcs x11, x6, x3 +// adcs x9, x5, x15 +// adc x14, xzr, xzr +// mul x6, x13, x7 +// mul x8, x1, x12 +// umulh x5, x13, x7 +// adds x3, x6, x8 +// umulh x2, x1, x12 +// adcs x4, x5, x2 +// adcs x15, x2, xzr +// adds x3, x5, x3 +// adcs x4, x8, x4 +// adcs x15, x15, xzr +// subs x1, x13, x1 +// cneg x8, x1, cc +// csetm x5, cc +// subs x1, x12, x7 +// cneg x2, x1, cc +// mul x7, x8, x2 +// umulh x2, x8, x2 +// cinv x13, x5, 
cc +// eor x7, x7, x13 +// eor x2, x2, x13 +// cmn x13, #0x1 +// adcs x3, x3, x7 +// adcs x4, x4, x2 +// adc x5, x15, x13 +// cmn x10, #0x1 +// eor x8, x6, x10 +// adcs x15, x8, x17 +// eor x2, x3, x10 +// adcs x2, x2, x16 +// eor x6, x4, x10 +// adcs x3, x6, x11 +// eor x7, x5, x10 +// adcs x1, x7, x9 +// adcs x13, x14, x10 +// adcs x12, x10, xzr +// adc x10, x10, xzr +// adds x5, x3, x17 +// adcs x8, x1, x16 +// adcs x13, x13, x11 +// adcs x6, x12, x9 +// adc x4, x10, x14 +// lsl x9, x15, #32 +// subs x7, x15, x9 +// lsr x1, x15, #32 +// sbc x14, x15, x1 +// adds x10, x2, x9 +// adcs x15, x5, x1 +// adcs x5, x8, x7 +// adc x7, x14, xzr +// lsl x12, x10, #32 +// subs x17, x10, x12 +// lsr x9, x10, #32 +// sbc x3, x10, x9 +// adds x12, x15, x12 +// adcs x5, x5, x9 +// adcs x14, x7, x17 +// adc x2, x3, xzr +// adds x14, x13, x14 +// adcs x6, x6, x2 +// adc x17, x4, xzr +// add x7, x17, #0x1 +// lsl x16, x7, #32 +// adds x3, x6, x16 +// adc x1, x17, xzr +// neg x15, x7 +// sub x13, x16, #0x1 +// subs x9, x12, x15 +// sbcs x8, x5, x13 +// sbcs x15, x14, xzr +// sbcs x3, x3, x7 +// sbcs x7, x1, x7 +// adds x4, x9, x7 +// mov x6, #0xffffffff +// and x17, x6, x7 +// adcs x8, x8, x17 +// adcs x5, x15, xzr +// mov x10, #0xffffffff00000001 +// and x1, x10, x7 +// adc x12, x3, x1 +// stp x4, x8, [x0] // @slothy:writes=buffer0 +// stp x5, x12, [x0, #16] // @slothy:writes=buffer16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret' as, say, 'input.S' +// export OUTPUTS="[hint_buffer0,hint_buffer16]" +// export RESERVED_REGS="[x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. 
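+//
+// As a purely illustrative reference model (not part of the build), the
+// contract stated above can be expressed with Python (3.8+) big integers as:
+//
+//   p_256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//   def montmul_p256_ref(x, y):
+//       assert x * y <= 2**256 * p_256
+//       return (x * y * pow(2**256, -1, p_256)) % p_256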
+ + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montmul_p256): + + ldr q20, [x2] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x6, x10, [x2] + ldp x11, x15, [x1, #16] + rev64 v16.4S, v20.4S + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4S, v16.4S, v0.4S + umulh x12, x17, x10 + uzp1 v28.4S, v20.4S, v0.4S + subs x14, x11, x7 + ldr q20, [x2, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2D, v16.4S + umulh x4, x7, x6 + uzp1 v21.4S, v0.4S, v0.4S + cneg x11, x8, cc + shl v17.2D, v27.2D, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2D, v21.2S, v28.2S + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2S, v20.2D + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4S, v20.4S, v20.4S + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2S, v28.2D + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x2, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x9, x3, x13 + adcs x3, x8, x7 + umulh x8, x14, x11 + umull v21.2D, v0.2S, v1.2S + adcs x12, x10, x12 + umull v3.2D, v0.2S, v16.2S + adc x15, x15, xzr + rev64 v24.4S, v20.4S + stp x12, x15, [x0, #16] + movi v2.2D, #0x00000000ffffffff + mul x10, x14, x11 + mul v4.4S, v24.4S, v28.4S + subs x13, x14, x5 + uzp2 v19.4S, v28.4S, v28.4S + csetm x15, cc + usra v3.2D, v21.2D, #32 + mul x7, x5, x1 + umull v21.2D, v19.2S, v16.2S + cneg x13, x13, cc + uaddlp v5.2D, v4.4S + subs x11, x1, x11 + and v16.16B, v3.16B, v2.16B + umulh x5, x5, x1 + shl v24.2D, v5.2D, #32 + cneg x11, x11, cc + umlal v16.2D, v19.2S, v1.2S + cinv x12, x15, cc + umlal v24.2D, v0.2S, v1.2S + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + stp x9, x3, [x0] + usra v21.2D, v3.2D, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2D, v16.2D, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + ldp x15, x8, [x0] + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + ldp x9, x13, [x0, #16] + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x15 + adcs x15, x16, x8 + eor x5, x17, x4 + adcs x9, x1, x9 + eor x1, x10, x5 + adcs x16, x2, x13 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, 
x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [x0] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256_alt.S new file mode 100644 index 00000000000..98a396eec99 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montmul_p256_alt.S @@ -0,0 +1,205 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256 (in particular this is true if we are in +// the "usual" case x < p_256 and y < p_256). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with tmp. The "mc" parameter is +// assumed to be a register whose value is 0xFFFFFFFF00000001. +// It is fine for d4 to be the same register as d0, and it often is. 
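+//
+// (Illustrative identity, in Python-style big-integer notation: the macro
+// computes [d4;d3;d2;d1] = ([d3;d2;d1;d0] + d0 * p_256) >> 64, one word-sized
+// Montgomery reduction step; the shift is exact because p_256 + 1 is
+// divisible by 2**64.)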
+// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, tmp,mc) \ + adds d1, d1, d0, lsl #32 __LF \ + lsr tmp, d0, #32 __LF \ + adcs d2, d2, tmp __LF \ + mul tmp, d0, mc __LF \ + umulh d4, d0, mc __LF \ + adcs d3, d3, tmp __LF \ + adc d4, d4, xzr + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define l x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 +#define h a3 +#define mc b3 + +S2N_BN_SYMBOL(bignum_montmul_p256_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul l, a0, b1 + umulh u2, a0, b1 + adds u1, u1, l + + ldp b2, b3, [y, #16] + + mul l, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, l + + mul l, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, l + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul l, a1, b0 + adds u1, u1, l + mul l, a1, b1 + adcs u2, u2, l + mul l, a1, b2 + adcs u3, u3, l + mul l, a1, b3 + adcs u4, u4, l + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh l, a1, b0 + adds u2, u2, l + umulh l, a1, b1 + adcs u3, u3, l + umulh l, a1, b2 + adcs u4, u4, l + adc u5, u5, xzr + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul l, a2, b0 + adds u2, u2, l + mul l, a2, b1 + adcs u3, u3, l + mul l, a2, b2 + adcs u4, u4, l + mul l, a2, b3 + adcs u5, u5, l + umulh u6, a2, b3 + adc u6, u6, xzr + + umulh l, a2, b0 + adds u3, u3, l + umulh l, a2, b1 + adcs u4, u4, l + umulh l, a2, b2 + adcs u5, u5, l + adc u6, u6, xzr + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] +// Interleave the first Montgomery rotation of the low half + + mul l, a3, b0 + adds u3, u3, l + mul l, a3, b1 + adcs u4, u4, l + mul l, a3, b2 + adcs u5, u5, l + mul l, a3, b3 + adcs u6, u6, l + umulh u7, a3, b3 + adc u7, u7, xzr + + mov mc, 0xFFFFFFFF00000001 + montreds(u0,u3,u2,u1,u0, l,mc) + + umulh l, a3, b0 + adds u4, u4, l + umulh l, a3, b1 + adcs u5, u5, l + umulh l, a3, b2 + adcs u6, u6, l + adc u7, u7, xzr + +// Perform 3 further Montgomery steps to rotate the lower half + + montreds(u1,u0,u3,u2,u1, l,mc) + montreds(u2,u1,u0,u3,u2, l,mc) + montreds(u3,u2,u1,u0,u3, l,mc) + +// Add high and low parts, catching carry in b1 + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + adcs u3, u3, u7 + cset b1, cs + +// Set [mc;0;l;-1] = p_256 and form [u7,u6,u5,u4] = [b1;u3;u2;u1;u0] - p_256 + + mov l, #0x00000000ffffffff + + subs u4, u0, #-1 + sbcs u5, u1, l + sbcs u6, u2, xzr + sbcs u7, u3, mc + sbcs xzr, b1, xzr + +// Now CF is clear if the comparison carried so the original was fine +// Otherwise take the form with p_256 subtracted. 
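+//
+// In reference terms (an added sketch, not from the verified source): the
+// quantity being reduced here is [b1;u3;u2;u1;u0], i.e. t + (b1 << 256)
+// with t < 2^256, and it is known to lie below 2 * p_256, so a single
+// conditional subtraction finishes the reduction. In Python:
+//
+//   P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//
+//   def final_reduce(t, b1):
+//       v = t + (b1 << 256)          # sum of the high and low halves
+//       return v - P256 if v >= P256 else v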
+ + csel u0, u0, u4, cc + csel u1, u1, u5, cc + csel u2, u2, u6, cc + csel u3, u3, u7, cc + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256.S new file mode 100644 index 00000000000..c23bebc57c1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256.S @@ -0,0 +1,325 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256, assuming x^2 <= 2^256 * p_256, which is +// guaranteed in particular if x < p_256 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- + +// bignum_montsqr_p256 is functionally equivalent to +// unopt/bignum_montsqr_p256_base. +// It is written in a way that +// 1. A subset of scalar multiplications in bignum_montsqr_p256_base are carefully +// chosen and vectorized +// 2. The vectorized assembly is rescheduled using the SLOTHY superoptimizer. +// https://github.com/slothy-optimizer/slothy +// +// The output program of step 1. is as follows: +// +// ldp x7, x3, [x1] +// ldr q6, [x1] +// ldp x9, x8, [x1, #16] +// ldr q18, [x1, #16] +// ldr q27, [x1] +// umull v16.2D, v27.2S, v27.2S +// umull2 v17.2D, v27.4S, v27.4S +// xtn v30.2S, v27.2D +// uzp2 v27.4S, v27.4S, v27.4S +// umull v27.2D, v27.2S, v30.2S +// mov x6, v16.d[0] +// mov x12, v16.d[1] +// mov x13, v17.d[0] +// mov x1, v17.d[1] +// mov x15, v27.d[0] +// mov x10, v27.d[1] +// adds x4, x6, x15, lsl #33 +// lsr x6, x15, #31 +// adc x15, x12, x6 +// adds x13, x13, x10, lsl #33 +// lsr x6, x10, #31 +// adc x12, x1, x6 +// mul x6, x7, x3 +// umulh x1, x7, x3 +// adds x5, x15, x6, lsl #1 +// extr x6, x1, x6, #63 +// adcs x10, x13, x6 +// lsr x6, x1, #63 +// adc x15, x12, x6 +// lsl x6, x4, #32 +// subs x13, x4, x6 +// lsr x12, x4, #32 +// sbc x1, x4, x12 +// adds x6, x5, x6 +// adcs x5, x10, x12 +// adcs x10, x15, x13 +// adc x15, x1, xzr +// lsl x13, x6, #32 +// subs x12, x6, x13 +// lsr x1, x6, #32 +// sbc x6, x6, x1 +// adds x16, x5, x13 +// adcs x11, x10, x1 +// adcs x2, x15, x12 +// adc x17, x6, xzr +// uzp1 v30.4S, v18.4S, v6.4S +// rev64 v27.4S, v18.4S +// uzp1 v18.4S, v6.4S, v6.4S +// mul v27.4S, v27.4S, v6.4S +// uaddlp v5.2D, v27.4S +// shl v6.2D, v5.2D, #32 +// umlal v6.2D, v18.2S, v30.2S +// mov x4, v6.d[0] +// mov x5, v6.d[1] +// umulh x10, x7, x9 +// subs x6, x7, x3 +// cneg x13, x6, cc +// csetm x12, cc +// subs x6, x8, x9 +// cneg x6, x6, cc +// mul x1, x13, x6 +// umulh x6, x13, x6 +// cinv x15, x12, cc +// eor x12, x1, x15 +// eor x13, x6, x15 +// adds x1, x4, x10 +// adc x6, x10, xzr +// umulh x3, x3, x8 +// adds x1, x1, x5 +// adcs x6, x6, x3 +// adc x3, x3, xzr +// adds x6, x6, x5 +// adc x3, x3, xzr +// cmn x15, #0x1 +// adcs x12, x1, x12 +// adcs x1, x6, x13 +// adc x3, x3, x15 +// adds x6, x4, x4 +// adcs x13, x12, x12 +// adcs x12, x1, x1 +// adcs x1, x3, x3 +// adc x3, xzr, xzr +// adds x6, x6, x16 +// adcs x5, x13, x11 
+// adcs x10, x12, x2 +// adcs x15, x1, x17 +// adc x13, x3, xzr +// lsl x3, x6, #32 +// subs x12, x6, x3 +// lsr x1, x6, #32 +// sbc x6, x6, x1 +// adds x3, x5, x3 +// adcs x5, x10, x1 +// adcs x15, x15, x12 +// adcs x13, x13, x6 +// adc x10, xzr, xzr +// lsl x6, x3, #32 +// subs x12, x3, x6 +// lsr x1, x3, #32 +// sbc x3, x3, x1 +// adds x6, x5, x6 +// adcs x15, x15, x1 +// adcs x13, x13, x12 +// adcs x12, x10, x3 +// adc x1, xzr, xzr +// mul x3, x9, x9 +// adds x5, x6, x3 +// mul x6, x8, x8 +// umulh x3, x9, x9 +// adcs x15, x15, x3 +// adcs x13, x13, x6 +// umulh x3, x8, x8 +// adcs x12, x12, x3 +// adc x1, x1, xzr +// mul x6, x9, x8 +// umulh x3, x9, x8 +// adds x8, x6, x6 +// adcs x9, x3, x3 +// adc x3, xzr, xzr +// adds x10, x15, x8 +// adcs x15, x13, x9 +// adcs x13, x12, x3 +// adcs x12, x1, xzr +// mov x3, #0xffffffff +// adds x6, x5, #0x1 +// sbcs x8, x10, x3 +// mov x3, #0xffffffff00000001 +// sbcs x9, x15, xzr +// sbcs x1, x13, x3 +// sbcs xzr, x12, xzr +// csel x6, x6, x5, cs +// csel x8, x8, x10, cs +// csel x9, x9, x15, cs +// csel x3, x1, x13, cs +// stp x6, x8, [x0] // @slothy:writes=buffer0 +// stp x9, x3, [x0, #16] // @slothy:writes=buffer16 +// ret +// +// The bash script used for step 2 is as follows: +// +// # Store the assembly instructions except the last 'ret' as, say, 'input.S' +// export OUTPUTS="[hint_buffer0,hint_buffer16]" +// export RESERVED_REGS="[x18,x19,x20,x21,x22,x23,x24,x25,x26,x27,x28,x29,x30,sp,q8,q9,q10,q11,q12,q13,q14,q15,v8,v9,v10,v11,v12,v13,v14,v15]" +// /tools/external/slothy.sh input.S my_out_dir +// # my_out_dir/3.opt.s is the optimized assembly. Its output may differ +// # from this file since the sequence is non-deterministically chosen. +// # Please add 'ret' at the end of the output assembly. + + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256) + .text + .balign 4 + +S2N_BN_SYMBOL(bignum_montsqr_p256): + + ldr q19, [x1] + ldp x9, x13, [x1] + ldr q23, [x1, #16] + ldr q0, [x1] + ldp x1, x10, [x1, #16] + uzp2 v29.4S, v19.4S, v19.4S + xtn v4.2S, v19.2D + umulh x8, x9, x13 + rev64 v20.4S, v23.4S + umull v16.2D, v19.2S, v19.2S + umull v1.2D, v29.2S, v4.2S + mul v20.4S, v20.4S, v0.4S + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2D, v19.4S, v19.4S + mov x4, v16.d[0] + uzp1 v17.4S, v23.4S, v0.4S + uaddlp v19.2D, v20.4S + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4S, v0.4S, v0.4S + shl v19.2D, v19.2D, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2D, v20.2S, v17.2S + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn 
x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256_alt.S new file mode 100644 index 00000000000..cecc797a5bc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_montsqr_p256_alt.S @@ -0,0 +1,183 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256, assuming x^2 <= 2^256 * p_256, which is +// guaranteed in particular if x < p_256 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with "tmp". The "mc" parameter is +// assumed to be a register whose value is 0xFFFFFFFF00000001. +// It is fine for d4 to be the same register as d0, and it often is. 
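+//
+// At the top level, the contract shared by both squaring routines can be
+// written as one line of Python (reference sketch only; R is the Montgomery
+// radix 2^256):
+//
+//   P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//   R = 2**256
+//
+//   def montsqr_model(x):
+//       return (x * x * pow(R, -1, P256)) % P256
+//
+// so if the input is in Montgomery form, x == a * R (mod p_256), the output
+// is (a * a) * R (mod p_256) and squaring stays inside the Montgomery domain.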
+// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, tmp,mc) \ + adds d1, d1, d0, lsl #32 __LF \ + lsr tmp, d0, #32 __LF \ + adcs d2, d2, tmp __LF \ + mul tmp, d0, mc __LF \ + umulh d4, d0, mc __LF \ + adcs d3, d3, tmp __LF \ + adc d4, d4, xzr + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define l x6 +#define h x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 + +// This one is the same as h, which is safe with this computation sequence + +#define u7 h + +// This one is the same as a3, and is used for the Montgomery constant +// 0xFFFFFFFF00000001 + +#define mc x5 + +S2N_BN_SYMBOL(bignum_montsqr_p256_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). +// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Squaring complete. Perform 4 Montgomery steps to rotate the lower half + + mov mc, #0xFFFFFFFF00000001 + montreds(u0,u3,u2,u1,u0, a0,mc) + montreds(u1,u0,u3,u2,u1, a0,mc) + montreds(u2,u1,u0,u3,u2, a0,mc) + montreds(u3,u2,u1,u0,u3, a0,mc) + +// Add high and low parts, catching carry in a0 + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + adcs u3, u3, u7 + cset a0, cs + +// Set [a3;0;a1;-1] = p_256 and form [u7,u6,u5,u4] = [a0;u3;u2;u1;u0] - p_256 +// Note that a3 == mc was already set above + + mov a1, #0x00000000ffffffff + + subs u4, u0, #-1 + sbcs u5, u1, a1 + sbcs u6, u2, xzr + sbcs u7, u3, mc + sbcs xzr, a0, xzr + +// Now CF is clear if the comparison carried so the original was fine +// Otherwise take the form with p_256 subtracted. + + csel u0, u0, u4, cc + csel u1, u1, u5, cc + csel u2, u2, u6, cc + csel u3, u3, u7, cc + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mux_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mux_4.S new file mode 100644 index 00000000000..e678e652693 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_mux_4.S @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[4], y[4]; output z[4] +// +// extern void bignum_mux_4 +// (uint64_t p, uint64_t z[static 4], +// uint64_t x[static 4], uint64_t y[static 4]); +// +// It is assumed that all numbers x, y and z have the same size 4 digits. +// +// Standard ARM ABI: X0 = p, X1 = z, X2 = x, X3 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux_4) + .text + .balign 4 + +#define p x0 +#define z x1 +#define x x2 +#define y x3 +#define a x4 + + +S2N_BN_SYMBOL(bignum_mux_4): + +cmp p, #0 // Set condition codes p = 0 + + ldr a, [x] + ldr p, [y] + csel a, a, p, ne + str a, [z] + + ldr a, [x, #8] + ldr p, [y, #8] + csel a, a, p, ne + str a, [z, #8] + + ldr a, [x, #16] + ldr p, [y, #16] + csel a, a, p, ne + str a, [z, #16] + + ldr a, [x, #24] + ldr p, [y, #24] + csel a, a, p, ne + str a, [z, #24] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_neg_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_neg_p256.S new file mode 100644 index 00000000000..7007362fc85 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_neg_p256.S @@ -0,0 +1,67 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_256, z := (-x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_p256 (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_p256) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define p x2 +#define t x3 + +#define d0 x4 +#define d1 x5 +#define d2 x6 +#define d3 x7 + + +S2N_BN_SYMBOL(bignum_neg_p256): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Set a bitmask p for the input being nonzero, so that we avoid doing +// -0 = p_256 and hence maintain strict modular reduction + + orr t, d0, d1 + orr p, d2, d3 + orr p, p, t + cmp p, #0 + csetm p, ne + +// Mask the nontrivial words of p_256 = [n3;0;n1;-1] and subtract + + subs d0, p, d0 + and t, p, #0x00000000ffffffff + sbcs d1, t, d1 + sbcs d2, xzr, d2 + and t, p, #0xffffffff00000001 + sbc d3, t, d3 + +// Write back the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_nonzero_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_nonzero_4.S new file mode 100644 index 00000000000..3d6f8b36539 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_nonzero_4.S @@ -0,0 +1,44 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// 256-bit nonzeroness test, returning 1 if x is nonzero, 0 if x is zero +// Input x[4]; output function return +// +// extern uint64_t bignum_nonzero_4(uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = x, returns X0 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_nonzero_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_nonzero_4) + .text + .balign 4 + +#define x x0 +#define a x1 +#define d x2 +#define c x3 + + +S2N_BN_SYMBOL(bignum_nonzero_4): + +// Generate a = an OR of all the words in the bignum + + ldp a, d, [x] + orr a, a, d + ldp c, d, [x, #16] + orr c, c, d + orr a, a, c + +// Set a standard C condition based on whether a is nonzero + + cmp a, xzr + cset x0, ne + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_optneg_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_optneg_p256.S new file mode 100644 index 00000000000..2f9260d5871 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_optneg_p256.S @@ -0,0 +1,84 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p256 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = p, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p256) + .text + .balign 4 + +#define z x0 +#define p x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define n0 x7 +#define n1 x8 +#define n2 x9 +#define n3 x10 + + +S2N_BN_SYMBOL(bignum_optneg_p256): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Adjust p by zeroing it if the input is zero (to avoid giving -0 = p, which +// is not strictly reduced even though it's correct modulo p) + + orr n0, d0, d1 + orr n1, d2, d3 + orr n2, n0, n1 + cmp n2, #0 + csel p, xzr, p, eq + +// Load the three nonzero words of p_256 = [n3;0;n1;n0] + + mov n0, #0xffffffffffffffff + mov n1, #0x00000000ffffffff + mov n3, #0xffffffff00000001 + +// Do the subtraction, which by hypothesis does not underflow + + subs n0, n0, d0 + sbcs n1, n1, d1 + sbcs n2, xzr, d2 + sbc n3, n3, d3 + +// Set condition code if original x is nonzero and p was nonzero + + cmp p, #0 + +// Hence multiplex and write back + + csel n0, n0, d0, ne + csel n1, n1, d1, ne + csel n2, n2, d2, ne + csel n3, n3, d3, ne + + stp n0, n1, [z] + stp n2, n3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_sub_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_sub_p256.S new file mode 100644 index 00000000000..5e71ca0892c --- /dev/null +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_sub_p256.S @@ -0,0 +1,67 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_256, z := (x - y) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p256) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define l x4 +#define d0 x5 +#define d1 x6 +#define d2 x7 +#define d3 x8 + + +S2N_BN_SYMBOL(bignum_sub_p256): + +// First just subtract the numbers as [d3; d2; d1; d0] +// Set a mask based on (inverted) carry indicating x < y = correction is needed + + ldp d0, d1, [x] + ldp l, c, [y] + subs d0, d0, l + sbcs d1, d1, c + ldp d2, d3, [x, #16] + ldp l, c, [y, #16] + sbcs d2, d2, l + sbcs d3, d3, c + +// Create a mask for the condition x < y, when we need to correct + + csetm c, cc + +// Now correct by adding masked p_256 + + adds d0, d0, c + and l, c, #0x00000000ffffffff + adcs d1, d1, l + adcs d2, d2, xzr + and l, c, #0xffffffff00000001 + adc d3, d3, l + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_tomont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_tomont_p256.S new file mode 100644 index 00000000000..37574f3087d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_tomont_p256.S @@ -0,0 +1,116 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256_alt) + .text + .balign 4 + +// ---------------------------------------------------------------------------- +// Core "x |-> (2^64 * x) mod p_256" macro, with x assumed to be < p_256. +// Input is in [d4;d3;d2;d1] and output in [d3;d2;d1;d0] +// using d4 as well as t1, t2, t3 as temporaries. +// ---------------------------------------------------------------------------- + +#define modstep_p256(d4, d3,d2,d1,d0, t1,t2,t3) \ +/* Writing the input as z = 2^256 * h + 2^192 * l + t = 2^192 * hl + t, */ \ +/* our quotient approximation is MIN ((hl + hl>>32 + 1)>>64) (2^64 - 1). 
*/ \ + subs xzr, xzr, xzr __LF/* Set carry flag for +1 */ \ + extr t3, d4, d3, #32 __LF \ + adcs xzr, d3, t3 __LF \ + lsr t3, d4, #32 __LF \ + adcs t3, d4, t3 __LF \ + csetm d0, cs __LF \ + orr t3, t3, d0 __LF \ +/* First do [t2;t1] = 2^32 * q, which we use twice */ \ + lsl t1, t3, #32 __LF \ + lsr t2, t3, #32 __LF \ +/* Add 2^224 * q to sum */ \ + adds d3, d3, t1 __LF \ + adc d4, d4, t2 __LF \ +/* Accumulate [t2;t1;d0] = (2^96 - 1) * q */ \ + subs d0, xzr, t3 __LF \ + sbcs t1, t1, xzr __LF \ + sbc t2, t2, xzr __LF \ +/* Subtract (2^256 + 2^192 + 2^96 - 1) * q */ \ + subs d0, xzr, d0 __LF \ + sbcs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, t3 __LF \ +/* Use top word as mask to correct */ \ + adds d0, d0, d4 __LF \ + mov t1, #0x00000000ffffffff __LF \ + and t1, t1, d4 __LF \ + adcs d1, d1, t1 __LF \ + adcs d2, d2, xzr __LF \ + mov t1, #0xffffffff00000001 __LF \ + and t1, t1, d4 __LF \ + adc d3, d3, t1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define d4 x6 + +#define t0 x1 +#define t1 x7 +#define t2 x8 +#define t3 x9 + +S2N_BN_SYMBOL(bignum_tomont_p256): + +S2N_BN_SYMBOL(bignum_tomont_p256_alt): + +// Load the input + + ldp d0, d1, [x1] + ldp d2, d3, [x1, #16] + +// Do an initial reduction to make sure this is < p_256, using just +// a copy of the bignum_mod_p256_4 code. This is needed to set up the +// invariant "input < p_256" for the main modular reduction steps. + + mov t0, #0xffffffffffffffff + mov t1, #0x00000000ffffffff + mov t3, #0xffffffff00000001 + subs t0, d0, t0 + sbcs t1, d1, t1 + sbcs t2, d2, xzr + sbcs t3, d3, t3 + csel d0, d0, t0, cc + csel d1, d1, t1, cc + csel d2, d2, t2, cc + csel d3, d3, t3, cc + +// Successively multiply by 2^64 and reduce + + modstep_p256(d3,d2,d1,d0,d4, t1,t2,t3) + modstep_p256(d2,d1,d0,d4,d3, t1,t2,t3) + modstep_p256(d1,d0,d4,d3,d2, t1,t2,t3) + modstep_p256(d0,d4,d3,d2,d1, t1,t2,t3) + +// Store the result and return + + stp d1, d2, [x0] + stp d3, d4, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_triple_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_triple_p256.S new file mode 100644 index 00000000000..ad2ee2c223b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/bignum_triple_p256.S @@ -0,0 +1,112 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256, z := (3 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_256, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_256. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define h x6 + +// Slightly offset aliases for the d_i for readability. 
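+//
+// Reference sketch of the reduction used below (added for clarity, not part
+// of the source): with z = 3 * x < 3 * 2^256 and h = z >> 256, the quotient
+// estimate q = h + 1 gives -p_256 <= z - q * p_256 < p_256, so at most one
+// add-back of p_256 is needed. In Python:
+//
+//   P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//
+//   def triple_model(x):             # any x with 0 <= x < 2^256
+//       z = 3 * x
+//       q = (z >> 256) + 1
+//       r = z - q * P256
+//       return r + P256 if r < 0 else r
+//
+//   assert triple_model(2**256 - 1) == (3 * (2**256 - 1)) % P256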
+ +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 + +// More aliases for the same thing at different stages + +#define q x7 +#define c x7 + +// Other temporary variables + +#define t0 x8 +#define t1 x9 + + +S2N_BN_SYMBOL(bignum_triple_p256): + +S2N_BN_SYMBOL(bignum_triple_p256_alt): + +// Load the inputs + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] + + lsl d0, a0, #1 + adds d0, d0, a0 + extr d1, a1, a0, #63 + adcs d1, d1, a1 + extr d2, a2, a1, #63 + adcs d2, d2, a2 + extr d3, a3, a2, #63 + adcs d3, d3, a3 + lsr h, a3, #63 + adc h, h, xzr + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256 <= z - q * p_256 < p_256, so we just need +// to subtract q * p_256 and then if that's negative, add back p_256. + + add q, h, #1 + +// Initial subtraction of z - q * p_256, with bitmask c for the carry + + lsl t1, q, #32 + adds d3, d3, t1 + adc h, h, xzr + sub t0, xzr, q + sub t1, t1, #1 + subs d0, d0, t0 + sbcs d1, d1, t1 + sbcs d2, d2, xzr + sbcs d3, d3, q + sbc c, h, q + +// Use the bitmask c for final masked addition of p_256. + + adds d0, d0, c + mov t0, #0x00000000ffffffff + and t0, t0, c + adcs d1, d1, t0 + adcs d2, d2, xzr + neg t1, t0 + adc d3, d3, t1 + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd.S new file mode 100644 index 00000000000..548da6eb568 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd.S @@ -0,0 +1,3160 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// This is functionally equivalent to p256_montjadd in unopt/p256_montjadd.S. +// This is the result of doing the following sequence of optimizations: +// 1. Function inlining +// 2. Eliminating redundant load/store instructions +// 3. Folding (add addr, const) + load/store +// Function inlining is done manually. The second and third optimizations are +// done by a script. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjadd) + + .text + .balign 4 + +#define NUMSIZE 32 +#define NSPACE (NUMSIZE*7) + +S2N_BN_SYMBOL(p256_montjadd): + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x30, [sp, #-16]! 
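+//
+// Representation sketch (added for reference, not from the source): each
+// Montgomery-Jacobian triple (x',y',z') is the Jacobian triple (x,y,z) with
+// every coordinate multiplied by R = 2^256 mod p_256, and (x,y,z) in turn
+// denotes the affine point (x/z^2, y/z^3). Decoding such a triple in Python:
+//
+//   P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1
+//   R_INV = pow(2**256, -1, P256)
+//
+//   def decode(xm, ym, zm):
+//       x = (xm * R_INV) % P256
+//       y = (ym * R_INV) % P256
+//       z = (zm * R_INV) % P256
+//       zi = pow(z, -1, P256)        # assumes z != 0, i.e. not the point at infinity
+//       return (x * zi * zi) % P256, (y * zi * zi * zi) % P256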
+ sub sp, sp, NSPACE + + mov x21, x0 + mov x22, x1 + mov x23, x2 + mov x0, sp + ldr q19, [x22, #64] + ldp x9, x13, [x22, #64] + ldr q23, [x22, #80] + ldr q0, [x22, #64] + ldp x1, x10, [x22, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x19, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x20, x5, x2, cs + stp x14, x12, [x0, #16] + stp x19, x20, [x0] + ldr q19, [x23, #64] + ldp x9, x13, [x23, #64] + ldr q23, [x23, #80] + ldr q0, [x23, #64] + ldp x1, x10, [x23, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, 
x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [sp, #176] + stp x16, x2, [sp, #160] + ldr q20, [x22, #32] + ldp x7, x17, [x23, #64] + ldr q0, [x23, #64] + ldp x6, x10, [x22, #32] + ldp x11, x15, [x23, #80] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x23, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor 
x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #192] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #208] + ldr q20, [x23, #32] + ldp x7, x17, [x22, #64] + ldr q0, [x22, #64] + ldp x6, x10, [x23, #32] + ldp x11, x15, [x22, #80] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x23, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x22, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + 
adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x24, x11, x13 + and x1, x1, x13 + adcs x25, x4, x1 + and x1, x12, x13 + stp x24, x25, [sp, #32] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #48] + mov x1, sp + ldr q20, [x23, #0] + ldr q0, [x1] + ldp x6, x10, [x23, #0] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x19, x20 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x20, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x19 + ldr q20, [x23, #16] + sbcs x5, x15, x20 + ngc x17, xzr + subs x8, x11, x15 + 
uaddlp v27.2d, v16.4s + umulh x4, x19, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, 
x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #64] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #80] + ldr q20, [x22, #0] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x22, #0] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + 
adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + mov x1, sp + ldr q20, [sp, #32] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x25 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x24 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x25, x24 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x24, x7 + sbcs x9, x25, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x24, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x25, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x24 + eor x1, x10, x5 + adcs x16, x2, x25 
+ adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #48] + ldr q20, [sp, #192] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [sp, #192] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #208] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #208] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, 
#0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x9, x11, x13 + and x1, x1, x13 + adcs x10, x4, x1 + and x1, x12, x13 + stp x9, x10, [sp, #192] + adcs x11, x7, xzr + adc x12, x17, x1 + stp x11, x12, [sp, #208] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x13, x5, x3 + and x4, x3, #0xffffffff + adcs x24, x6, x4 + adcs x25, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x26, x8, x4 + stp x13, x24, [sp, #160] + stp x25, x26, [sp, #176] + subs x5, x19, x9 + sbcs x6, x20, x10 + ldp x7, x8, [sp, #48] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + csetm x3, cc + adds x19, x5, x3 + and x4, x3, #0xffffffff + adcs x20, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x19, x20, [sp, #32] + stp x7, x8, [sp, #48] + ldr q19, [sp, #160] + ldr q23, [sp, #176] + ldr q0, [sp, #160] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x13, x24 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x13, x24 + umulh x15, x13, x25 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x13, x24 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x26, x25 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x24, x26 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + 
adcs x6, x11, x8 + umulh x11, x25, x26 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x25, x26 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x26, x26 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x26, x26 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x25, x25 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x25, x25 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x24, x3, x16, cs + csel x25, x8, x14, cs + csel x26, x11, x12, cs + csel x27, x5, x2, cs + stp x25, x26, [sp, #112] + stp x24, x27, [sp, #96] + mov x0, sp + ldr q19, [sp, #32] + ldr q23, [sp, #48] + ldr q0, [sp, #32] + ldp x1, x10, [sp, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x19, x20 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x19, x20 + umulh x15, x19, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x19, x20 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x20, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, 
#0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ldr q20, [sp, #128] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #128] + rev64 v16.4s, v20.4s + subs x4, x24, x27 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x27, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x25, x24 + ldr q20, [sp, #144] + sbcs x5, x26, x27 + ngc x17, xzr + subs x8, x25, x26 + uaddlp v27.2d, v16.4s + umulh x4, x24, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #144] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x25, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x26, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds 
x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x25 + eor x1, x10, x5 + adcs x16, x2, x26 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + stp x19, x20, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + ldr q20, [sp, #64] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #64] + ldp x11, x15, [sp, #112] + rev64 v16.4s, v20.4s + subs x4, x24, x27 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x27, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x24 + ldr q20, [sp, #80] + sbcs x5, x15, x27 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x24, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds 
x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x9, x11, x13 + and x1, x1, x13 + adcs x10, x4, x1 + and x1, x12, x13 + stp x9, x10, [sp, #64] + adcs x11, x7, xzr + adc x12, x17, x1 + stp x11, x12, [sp, #80] + mov x0, sp + mov x1, sp + ldp x5, x6, [x1] + subs x5, x5, x19 + sbcs x6, x6, x20 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x24, x5, x3 + and x4, x3, #0xffffffff + adcs x25, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x7, x8, [x0, #16] + subs x5, x9, x19 + sbcs x6, x10, x20 + ldp x4, x3, [sp, #144] + sbcs x7, x11, x4 + sbcs x8, x12, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldr q20, [x22, #64] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x22, #64] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #80] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs 
x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #160] + adcs x19, x7, xzr + adc x20, x17, x1 + stp x19, x20, [sp, #176] + mov x0, sp + mov x1, sp + ldp x4, x3, [sp, #64] + subs x5, x24, x4 + sbcs x6, x25, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x9, x5, x3 + and x4, x3, #0xffffffff + adcs x10, x6, x4 + adcs x11, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x3, x8, x4 + stp x9, x10, [x0] + stp x11, x3, [x0, #16] + ldp x5, x6, [sp, #128] + subs x5, x5, x9 + sbcs x6, x6, x10 + ldp x7, x8, [sp, #144] + sbcs x7, x7, x11 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, 
#144] + ldr q20, [sp, #192] + ldp x7, x17, [sp, #96] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #192] + ldp x11, x15, [sp, #112] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #208] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #208] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, 
x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #96] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #112] + ldr q20, [x23, #64] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x23, #64] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x19, x7 + ldr q20, [x23, #80] + sbcs x5, x20, x17 + ngc x17, xzr + subs x8, x19, x20 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x24, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x25, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x24 + eor x1, x10, x5 + adcs x16, x2, x25 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 
+ eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + stp x19, x20, [sp, #160] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #176] + ldr q20, [sp, #128] + ldp x7, x17, [sp, #32] + ldr q0, [sp, #32] + ldp x6, x10, [sp, #128] + ldp x11, x15, [sp, #48] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #144] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #48] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #144] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + 
mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x5, x11, x13 + and x1, x1, x13 + adcs x6, x4, x1 + and x1, x12, x13 + adcs x7, x7, xzr + adc x9, x17, x1 + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x9, x3 + csetm x3, cc + adds x15, x5, x3 + and x4, x3, #0xffffffff + adcs x24, x6, x4 + adcs x25, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x26, x8, x4 + stp x15, x24, [sp, #128] + stp x25, x26, [sp, #144] + ldp x0, x1, [x22, #64] + ldp x2, x3, [x22, #80] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + ldp x4, x5, [x23, #64] + ldp x6, x7, [x23, #80] + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + cmp x13, x12 + csel x8, x0, x19, cc + csel x9, x1, x20, cc + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [sp, #176] + csel x10, x2, x10, cc + csel x11, x3, x11, cc + csel x10, x6, x10, hi + csel x11, x7, x11, hi + ldp x12, x13, [x22] + ldp x0, x1, [sp] + csel x0, x12, x0, cc + csel x1, x13, x1, cc + ldp x12, x13, [x23] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + ldp x12, x13, [x22, #16] + ldp x2, x3, [sp, #16] + csel x2, x12, x2, cc + csel x3, x13, x3, cc + ldp x12, x13, [x23, #16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + ldp x12, x13, [x22, #32] + csel x4, x12, x15, cc + csel x5, x13, x24, cc + ldp x12, x13, [x23, #32] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + ldp x12, x13, [x22, #48] + csel x6, x12, x25, cc + csel x7, x13, x26, cc + ldp x12, x13, [x23, #48] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + stp x0, x1, [x21] + stp x2, x3, [x21, #16] + stp x4, x5, [x21, #32] + stp x6, x7, [x21, #48] + stp x8, x9, [x21, #64] + stp x10, x11, [x21, #80] + + add sp, sp, NSPACE + ldp x27, x30, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd_alt.S new file mode 100644 index 00000000000..4849a2857a2 --- /dev/null +++ 
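
For reference, the borrow-masked correction that recurs throughout the subtraction sequences above (a subs/sbcs chain, csetm on the carry-clear condition to turn the borrow into an all-ones mask, then adding back p_256 with its limbs ANDed against that mask) can be modelled with plain Python integers. This is only an illustrative sketch; the function name below is mine, not taken from the file.

    # Sketch of the subtract-then-correct idiom used for subtraction mod p_256.
    P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1   # the NIST P-256 prime

    def sub_mod_p256(a, b):
        d = a - b            # the subs/sbcs chain over the four 64-bit limbs
        if d < 0:            # a borrow occurred: csetm produced the all-ones mask
            d += P256        # add back p_256 (limb-wise, masked by the borrow)
        return d             # result is reduced again, in [0, p_256)

    assert sub_mod_p256(5, 7) == P256 - 2
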
b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjadd_alt.S @@ -0,0 +1,549 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds to bignum_montmul_p256_alt except registers + +#define montmul_p256(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + 
umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + mov x10, #0xffffffff00000001 __LF \ + adds x13, x13, x12, lsl #32 __LF \ + lsr x11, x12, #32 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x12, x10 __LF \ + umulh x12, x12, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x12, x12, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + adds x14, x14, x13, lsl #32 __LF \ + lsr x11, x13, #32 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x13, x10 __LF \ + umulh x13, x13, x10 __LF \ + adcs x12, x12, x11 __LF \ + adc x13, x13, xzr __LF \ + adds x0, x0, x14, lsl #32 __LF \ + lsr x11, x14, #32 __LF \ + adcs x12, x12, x11 __LF \ + mul x11, x14, x10 __LF \ + umulh x14, x14, x10 __LF \ + adcs x13, x13, x11 __LF \ + adc x14, x14, xzr __LF \ + adds x12, x12, x0, lsl #32 __LF \ + lsr x11, x0, #32 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x0, x10 __LF \ + umulh x0, x0, x10 __LF \ + adcs x14, x14, x11 __LF \ + adc x0, x0, xzr __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + sbcs x4, x14, xzr __LF \ + sbcs x5, x0, x10 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + adds x9, x9, x8, lsl #32 __LF \ + lsr x3, x8, #32 __LF \ + adcs x10, x10, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x8, x3 __LF \ + umulh x8, x8, x3 __LF \ + adcs x11, x11, x2 __LF \ + adc x8, x8, xzr __LF \ + adds x10, x10, x9, lsl #32 __LF \ + lsr x3, x9, #32 __LF \ + adcs x11, x11, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x9, x3 __LF \ + umulh x9, x9, x3 __LF \ + adcs x8, x8, x2 __LF \ + adc x9, x9, xzr __LF \ + adds x11, x11, x10, lsl #32 
__LF \ + lsr x3, x10, #32 __LF \ + adcs x8, x8, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x10, x3 __LF \ + umulh x10, x10, x3 __LF \ + adcs x9, x9, x2 __LF \ + adc x10, x10, xzr __LF \ + adds x8, x8, x11, lsl #32 __LF \ + lsr x3, x11, #32 __LF \ + adcs x9, x9, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x11, x3 __LF \ + umulh x11, x11, x3 __LF \ + adcs x10, x10, x2 __LF \ + adc x11, x11, xzr __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, cs __LF \ + mov x3, #0xffffffff __LF \ + mov x5, #0xffffffff00000001 __LF \ + adds x12, x8, #0x1 __LF \ + sbcs x13, x9, x3 __LF \ + sbcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF \ + csel x8, x8, x12, cc __LF \ + csel x9, x9, x13, cc __LF \ + csel x10, x10, x14, cc __LF \ + csel x11, x11, x7, cc __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + adds x9, x9, x8, lsl #32 __LF \ + lsr x3, x8, #32 __LF \ + adcs x10, x10, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x8, x3 __LF \ + umulh x8, x8, x3 __LF \ + adcs x11, x11, x2 __LF \ + adc x8, x8, xzr __LF \ + adds x10, x10, x9, lsl #32 __LF \ + lsr x3, x9, #32 __LF \ + adcs x11, x11, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x9, x3 __LF \ + umulh x9, x9, x3 __LF \ + adcs x8, x8, x2 __LF \ + adc x9, x9, xzr __LF \ + adds x11, x11, x10, lsl #32 __LF \ + lsr x3, x10, #32 __LF \ + adcs x8, x8, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x10, x3 __LF \ + umulh x10, x10, x3 __LF \ + adcs x9, x9, x2 __LF \ + adc x10, x10, xzr __LF \ + adds x8, x8, x11, lsl #32 __LF \ + lsr x3, x11, #32 __LF \ + adcs x9, x9, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x11, x3 __LF \ + umulh x11, x11, x3 __LF \ + adcs x10, x10, x2 __LF \ + adc x11, x11, xzr __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + mov x2, #0xffffffffffffffff __LF \ + csel x2, xzr, x2, cc __LF \ + mov x3, #0xffffffff __LF \ + csel x3, xzr, x3, cc __LF \ + mov x5, #0xffffffff00000001 __LF \ + csel x5, xzr, x5, cc __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr 
__LF \ + sbc x11, x11, x5 __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, xzr __LF \ + mov x4, #0xffffffff00000001 __LF \ + and x4, x4, x3 __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(p256_montjadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_p256(z1sq,z_1) + amontsqr_p256(z2sq,z_2) + + montmul_p256(y1a,z_2,y_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,z1sq,x_2) + montmul_p256(x1a,z2sq,x_1) + montmul_p256(y2a,z1sq,y2a) + montmul_p256(y1a,z2sq,y1a) + + sub_p256(xd,x2a,x1a) + sub_p256(yd,y2a,y1a) + + amontsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x1a) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(xd,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y1a) + montmul_p256(resz,xd,z_2) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore registers and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble.S 
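
The field-operation schedule spelled out by the comments above (12 multiplications, 4 squarings, 7 subtractions, followed by the P1 = 0 / P2 = 0 multiplexing) is easier to follow in plain modular arithmetic. The sketch below mirrors that schedule with ordinary Python integers; it deliberately ignores the Montgomery encoding, and the function and variable names are illustrative rather than taken from the library.

    # Sketch of the p256_montjadd_alt schedule, in plain (non-Montgomery) arithmetic.
    P256 = 2**256 - 2**224 + 2**192 + 2**96 - 1

    def jacobian_add(p1, p2):
        x1, y1, z1 = p1
        x2, y2, z2 = p2
        z1sq = z1 * z1 % P256            # amontsqr_p256(z1sq, z_1)
        z2sq = z2 * z2 % P256            # amontsqr_p256(z2sq, z_2)
        y1a = z2 * y1 % P256             # montmul_p256(y1a, z_2, y_1)
        y2a = z1 * y2 % P256             # montmul_p256(y2a, z_1, y_2)
        x2a = z1sq * x2 % P256           # montmul_p256(x2a, z1sq, x_2)
        x1a = z2sq * x1 % P256           # montmul_p256(x1a, z2sq, x_1)
        y2a = z1sq * y2a % P256          # montmul_p256(y2a, z1sq, y2a)
        y1a = z2sq * y1a % P256          # montmul_p256(y1a, z2sq, y1a)
        xd = (x2a - x1a) % P256          # sub_p256(xd, x2a, x1a)
        yd = (y2a - y1a) % P256          # sub_p256(yd, y2a, y1a)
        zz = xd * xd % P256              # amontsqr_p256(zz, xd)
        ww = yd * yd % P256              # montsqr_p256(ww, yd)
        zzx1 = zz * x1a % P256           # montmul_p256(zzx1, zz, x1a)
        zzx2 = zz * x2a % P256           # montmul_p256(zzx2, zz, x2a)
        resx = (ww - zzx1) % P256        # sub_p256(resx, ww, zzx1)
        t1 = (zzx2 - zzx1) % P256        # sub_p256(t1, zzx2, zzx1)
        xd = xd * z1 % P256              # montmul_p256(xd, xd, z_1)
        resx = (resx - zzx2) % P256      # sub_p256(resx, resx, zzx2)
        t2 = (zzx1 - resx) % P256        # sub_p256(t2, zzx1, resx)
        t1 = t1 * y1a % P256             # montmul_p256(t1, t1, y1a)
        resz = xd * z2 % P256            # montmul_p256(resz, xd, z_2)
        t2 = yd * t2 % P256              # montmul_p256(t2, yd, t2)
        resy = (t2 - t1) % P256          # sub_p256(resy, t2, t1)
        # Multiplex against the degenerate inputs, as the csel block does with
        # the "lo"/"hi" conditions derived from (P2 != 0) - (P1 != 0).
        if z1 == 0:
            return p2
        if z2 == 0:
            return p1
        return (resx, resy, resz)

In the assembly itself every coordinate stays in the Montgomery domain (x' = 2^256 * x mod p_256), and montmul_p256 folds the 2^-256 factor back in, so multiplying two encoded values yields the encoding of their product and no conversions are needed between steps of the schedule.
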
b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble.S new file mode 100644 index 00000000000..6988a9439e3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble.S @@ -0,0 +1,1550 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + +// This is functionally equivalent to p256_montjdouble in unopt/p256_montjdouble.S. +// This is the result of doing the following sequence of optimizations: +// 1. Function inlining +// 2. Eliminating redundant load/store instructions +// 3. Folding (add addr, const) + load/store +// Function inlining is done manually. The second and third optimizations are +// done by a script. + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjdouble) + .text + .balign 4 + +#define NUMSIZE 32 +#define NSPACE #(NUMSIZE*6) + +S2N_BN_SYMBOL(p256_montjdouble): + + sub sp, sp, NSPACE+80 + stp x19, x20, [sp, NSPACE] + stp x21, x22, [sp, NSPACE+16] + stp x23, x24, [sp, NSPACE+32] + stp x25, x26, [sp, NSPACE+48] + stp x27, xzr, [sp, NSPACE+64] + + mov x19, x0 + mov x20, x1 + mov x0, sp + ldr q19, [x20, #64] + ldp x9, x13, [x20, #64] + ldr q23, [x20, #80] + ldr q0, [x20, #64] + ldp x1, x10, [x20, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, 
x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x21, x3, x16, cs + csel x22, x8, x14, cs + csel x23, x11, x12, cs + csel x24, x5, x2, cs + stp x22, x23, [x0, #16] + stp x21, x24, [x0] + ldr q19, [x20, #32] + ldp x9, x13, [x20, #32] + ldr q23, [x20, #48] + ldr q0, [x20, #32] + ldp x1, x10, [x20, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, 
x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [sp, #48] + stp x16, x2, [sp, #32] + ldp x5, x6, [x20, #0] + subs x5, x5, x21 + sbcs x6, x6, x24 + ldp x7, x8, [x20, #16] + sbcs x7, x7, x22 + sbcs x8, x8, x23 + csetm x3, cc + adds x10, x5, x3 + and x4, x3, #0xffffffff + adcs x25, x6, x4 + adcs x26, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x27, x8, x4 + stp x10, x25, [sp, #96] + stp x26, x27, [sp, #112] + ldp x5, x6, [x20] + adds x5, x5, x21 + adcs x6, x6, x24 + ldp x7, x8, [x20, #16] + adcs x7, x7, x22 + adcs x8, x8, x23 + csetm x3, cs + subs x9, x5, x3 + and x1, x3, #0xffffffff + sbcs x5, x6, x1 + sbcs x7, x7, xzr + and x2, x3, #0xffffffff00000001 + sbc x8, x8, x2 + stp x9, x5, [sp, #64] + stp x7, x8, [sp, #80] + ldr q20, [sp, #96] + ldr q0, [sp, #64] + rev64 v16.4s, v20.4s + subs x4, x9, x5 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x5, x25 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x7, x9 + ldr q20, [sp, #112] + sbcs x5, x8, x5 + ngc x17, xzr + subs x8, x7, x8 + uaddlp v27.2d, v16.4s + umulh x4, x9, x10 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x25, x10 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x10, x26 + sbcs x9, x25, x27 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x27, x26 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x21, x3, x13 + adcs x22, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x23, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x24, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs 
x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x21 + adcs x15, x16, x22 + eor x5, x17, x4 + adcs x9, x1, x23 + eor x1, x10, x5 + adcs x16, x2, x24 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x21, x11, x13 + and x1, x1, x13 + adcs x22, x4, x1 + and x1, x12, x13 + stp x21, x22, [sp, #96] + adcs x23, x7, xzr + adc x24, x17, x1 + stp x23, x24, [sp, #112] + ldp x4, x5, [x20, #32] + ldp x8, x9, [x20, #64] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x20, #48] + ldp x10, x11, [x20, #80] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #0xffffffff + sbcs x9, x5, x9 + sbcs x10, x6, xzr + mov x11, #0xffffffff00000001 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, cc + csel x5, x5, x9, cc + csel x6, x6, x10, cc + csel x7, x7, x11, cc + stp x4, x5, [sp, #64] + stp x6, x7, [sp, #80] + ldr q20, [sp, #32] + ldp x7, x17, [x20, #0] + ldr q0, [x20, #0] + ldp x6, x10, [sp, #32] + ldp x11, x15, [x20, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x20, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x20, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, 
x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x20 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x20, x11, x13 + and x1, x1, x13 + adcs x25, x4, x1 + and x1, x12, x13 + stp x20, x25, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + ldr q19, [sp, #96] + ldr q23, [sp, #112] + ldr q0, [sp, #96] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x21, x22 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x21, x22 + umulh x15, x21, x23 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x21, x22 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x24, x23 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x22, x24 + adcs x11, x11, x8 + lsr 
x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x23, x24 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x23, x24 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x24, x24 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x24, x24 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x23, x23 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x23, x23 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x21, x3, x16, cs + csel x22, x8, x14, cs + csel x23, x11, x12, cs + csel x24, x5, x2, cs + ldr q19, [sp, #64] + ldp x9, x13, [sp, #64] + ldr q23, [sp, #80] + ldr q0, [sp, #64] + ldp x1, x10, [sp, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, 
x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x13, x3, x16, cs + csel x14, x8, x14, cs + csel x15, x11, x12, cs + csel x26, x5, x2, cs + mov x1, #0x9 + mov x2, #0xffffffffffffffff + subs x9, x2, x21 + mov x2, #0xffffffff + sbcs x10, x2, x24 + ngcs x11, x22 + mov x2, #0xffffffff00000001 + sbc x12, x2, x23 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc + mul x8, x20, x1 + umulh x9, x20, x1 + adds x3, x3, x8 + mul x8, x25, x1 + umulh x10, x25, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #144] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x20, x3, x8 + and x9, x8, #0xffffffff + adcs x21, x4, x9 + adcs x22, x5, xzr + neg x10, x9 + adc x23, x6, x10 + stp x20, x21, [sp, #160] + stp x22, x23, [sp, #176] + mov x2, sp + ldp x4, x3, [x2] + subs x5, x13, x4 + sbcs x6, x26, x3 + ldp x4, x3, [x2, #16] + sbcs x7, x14, x4 + sbcs x8, x15, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + mov x0, sp + ldr q19, [sp, #32] + ldp x9, x13, [sp, #32] + ldr q23, [sp, #48] + ldr q0, [sp, #32] + ldp x1, x10, [sp, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, 
v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x24, x3, x16, cs + csel x25, x8, x14, cs + csel x26, x11, x12, cs + csel x27, x5, x2, cs + stp x25, x26, [x0, #16] + stp x24, x27, [x0] + ldr q20, [sp, #96] + ldr q0, [sp, #160] + ldp x6, x10, [sp, #96] + rev64 v16.4s, v20.4s + subs x4, x20, x21 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x21, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x22, x20 + ldr q20, [sp, #112] + sbcs x5, x23, x21 + ngc x17, xzr + subs x8, x22, x23 + uaddlp v27.2d, v16.4s + umulh x4, x20, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #112] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x20, x3, x13 + adcs x21, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x22, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x23, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, 
v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x20 + adcs x15, x16, x21 + eor x5, x17, x4 + adcs x9, x1, x22 + eor x1, x10, x5 + adcs x16, x2, x23 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x14, x11, x13 + and x1, x1, x13 + adcs x15, x4, x1 + and x1, x12, x13 + stp x14, x15, [sp, #96] + adcs x13, x7, xzr + adc x20, x17, x1 + stp x13, x20, [sp, #112] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [x19, #64] + stp x7, x8, [x19, #80] + ldp x1, x2, [sp, #128] + lsl x0, x1, #2 + ldp x6, x7, [sp, #160] + subs x0, x0, x6 + extr x1, x2, x1, #62 + sbcs x1, x1, x7 + ldp x3, x4, [sp, #144] + extr x2, x3, x2, #62 + ldp x6, x7, [sp, #176] + sbcs x2, x2, x6 + extr x3, x4, x3, #62 + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x5, x4, #0x1 + lsl x8, x5, #32 + negs x6, x8 + ngcs x7, xzr + sbc x8, x8, x5 + adds x0, x0, x5 + adcs x1, x1, x6 + adcs x2, x2, x7 + adcs x3, x3, x8 + csetm x5, cc + adds x0, x0, x5 + and x6, x5, #0xffffffff + adcs x1, x1, x6 + adcs x2, x2, xzr + neg x7, x6 + adc x3, x3, x7 + stp x0, x1, [x19] + stp x2, x3, [x19, #16] + mov x2, #0xffffffffffffffff + subs x9, x2, x24 + mov x2, #0xffffffff + sbcs x10, x2, x27 + ngcs x11, x25 + mov x2, #0xffffffff00000001 + sbc x12, x2, x26 + lsl x3, x9, #3 + extr x4, x10, x9, #61 + extr x5, x11, x10, #61 + extr x6, x12, x11, #61 + lsr x7, x12, #61 + mov x1, #0x3 + mul x8, x14, x1 + umulh x9, x14, x1 + adds x3, x3, x8 + mul x8, x15, x1 + umulh x10, x15, x1 + adcs x4, x4, x8 + mul x8, x13, x1 + umulh x11, x13, x1 + adcs x5, x5, x8 + mul x8, x20, x1 + umulh x12, x20, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc 
x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [x19, #32] + stp x5, x6, [x19, #48] + + ldp x27, xzr, [sp, NSPACE+64] + ldp x25, x26, [sp, NSPACE+48] + ldp x23, x24, [sp, NSPACE+32] + ldp x21, x22, [sp, NSPACE+16] + ldp x19, x20, [sp, NSPACE] + add sp, sp, NSPACE+80 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble_alt.S new file mode 100644 index 00000000000..d6079cccf4a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjdouble_alt.S @@ -0,0 +1,582 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y4 sp, #(NUMSIZE*0) + +#define y2 sp, #(NUMSIZE*1) + +#define t1 sp, #(NUMSIZE*2) + +#define t2 sp, #(NUMSIZE*3) +#define x2p sp, #(NUMSIZE*3) +#define dx2 sp, #(NUMSIZE*3) + +#define xy2 sp, #(NUMSIZE*4) + +#define x4p sp, #(NUMSIZE*5) +#define d sp, #(NUMSIZE*5) + +#define NSPACE #(NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256_alt except registers + +#define montmul_p256(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh 
x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + mov x10, #0xffffffff00000001 __LF \ + adds x13, x13, x12, lsl #32 __LF \ + lsr x11, x12, #32 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x12, x10 __LF \ + umulh x12, x12, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x12, x12, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + adds x14, x14, x13, lsl #32 __LF \ + lsr x11, x13, #32 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x13, x10 __LF \ + umulh x13, x13, x10 __LF \ + adcs x12, x12, x11 __LF \ + adc x13, x13, xzr __LF \ + adds x0, x0, x14, lsl #32 __LF \ + lsr x11, x14, #32 __LF \ + adcs x12, x12, x11 __LF \ + mul x11, x14, x10 __LF \ + umulh x14, x14, x10 __LF \ + adcs x13, x13, x11 __LF \ + adc x14, x14, xzr __LF \ + adds x12, x12, x0, lsl #32 __LF \ + lsr x11, x0, #32 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x0, x10 __LF \ + umulh x0, x0, x10 __LF \ + adcs x14, x14, x11 __LF \ + adc x0, x0, xzr __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + sbcs x4, x14, xzr __LF \ + sbcs x5, x0, x10 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, hs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, 
x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + mov x5, #-4294967295 __LF \ + adds x9, x9, x8, lsl #32 __LF \ + lsr x3, x8, #32 __LF \ + adcs x10, x10, x3 __LF \ + mul x2, x8, x5 __LF \ + umulh x8, x8, x5 __LF \ + adcs x11, x11, x2 __LF \ + adc x8, x8, xzr __LF \ + adds x10, x10, x9, lsl #32 __LF \ + lsr x3, x9, #32 __LF \ + adcs x11, x11, x3 __LF \ + mul x2, x9, x5 __LF \ + umulh x9, x9, x5 __LF \ + adcs x8, x8, x2 __LF \ + adc x9, x9, xzr __LF \ + adds x11, x11, x10, lsl #32 __LF \ + lsr x3, x10, #32 __LF \ + adcs x8, x8, x3 __LF \ + mul x2, x10, x5 __LF \ + umulh x10, x10, x5 __LF \ + adcs x9, x9, x2 __LF \ + adc x10, x10, xzr __LF \ + adds x8, x8, x11, lsl #32 __LF \ + lsr x3, x11, #32 __LF \ + adcs x9, x9, x3 __LF \ + mul x2, x11, x5 __LF \ + umulh x11, x11, x5 __LF \ + adcs x10, x10, x2 __LF \ + adc x11, x11, xzr __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, hs __LF \ + mov x3, #4294967295 __LF \ + adds x12, x8, #1 __LF \ + sbcs x13, x9, x3 __LF \ + sbcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF \ + csel x8, x8, x12, lo __LF \ + csel x9, x9, x13, lo __LF \ + csel x10, x10, x14, lo __LF \ + csel x11, x11, x7, lo __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, lo __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, xzr __LF \ + and x4, x3, #0xffffffff00000001 __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Corresponds exactly to bignum_add_p256 + +#define add_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + adc x3, xzr, xzr __LF \ + cmn x5, #1 __LF \ + mov x4, #4294967295 __LF \ + sbcs xzr, x6, x4 __LF \ + sbcs xzr, x7, xzr __LF \ + mov x4, #-4294967295 __LF \ + sbcs xzr, x8, x4 __LF \ + adcs x3, x3, xzr __LF \ + csetm x3, ne __LF \ + subs x5, x5, x3 __LF \ + and x4, x3, #0xffffffff __LF \ + sbcs x6, x6, x4 __LF \ + sbcs x7, x7, xzr __LF \ + and x4, x3, #0xffffffff00000001 __LF \ + sbc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + csetm x3, cs __LF \ + subs x5, x5, x3 __LF \ + and x1, x3, #4294967295 __LF \ + sbcs x6, x6, x1 __LF \ + sbcs x7, x7, xzr __LF \ + and x2, x3, #-4294967295 __LF \ + sbc x8, x8, x2 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// P0 = C * P1 - D * P2 computed as D * (p_256 - P2) + C * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_p256 +// This also applies to the other functions following. 
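+//
+// Illustrative note (added for exposition; not part of the upstream source):
+// with C = 12 and D = 9 as instantiated below, P1 and P2 are reduced residues,
+// so the accumulated value D * (p_256 - P2) + C * P1 is < 21 * p_256 and its
+// fifth 64-bit digit h is at most 20. Because p_256 = 2^256 - 2^224 + 2^192 +
+// 2^96 - 1 is within a factor of roughly (1 + 2^-32) of 2^256, the true
+// quotient is h or h + 1, so q = h + 1 overshoots by at most one, and a single
+// masked addition of p_256 at the end of the macro restores a value in
+// [0, p_256) in constant time.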
+ +#define cmsub_p256(P0,C,P1,D,P2) \ + mov x1, D __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x2, #4294967295 __LF \ + sbcs x10, x2, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, xzr, x11 __LF \ + mov x2, #-4294967295 __LF \ + sbc x12, x2, x12 __LF \ + mul x3, x1, x9 __LF \ + mul x4, x1, x10 __LF \ + mul x5, x1, x11 __LF \ + mul x6, x1, x12 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + umulh x11, x1, x11 __LF \ + umulh x7, x1, x12 __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, xzr __LF \ + mov x1, C __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x8, x7, #1 __LF \ + lsl x10, x8, #32 __LF \ + adds x6, x6, x10 __LF \ + adc x7, x7, xzr __LF \ + neg x9, x8 __LF \ + sub x10, x10, #1 __LF \ + subs x3, x3, x9 __LF \ + sbcs x4, x4, x10 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, x8 __LF \ + sbc x8, x7, x8 __LF \ + adds x3, x3, x8 __LF \ + and x9, x8, #4294967295 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + neg x10, x9 __LF \ + adc x6, x6, x10 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// P0 = 4 * P1 - P2, by direct subtraction of P2; the method +// in bignum_cmul_p256 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256, which is the case here. The +// actual accumulation of q * p_256 is done a bit differently +// so it works for the q = 0 case. 
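+//
+// Illustrative note (added for exposition; not part of the upstream source):
+// since 0 <= P1, P2 < p_256, the value 4 * P1 - P2 lies strictly between
+// -p_256 and 4 * p_256, which is the bound the remark above relies on. The
+// top digit of the five-digit two's-complement result is 0..3 when it is
+// non-negative and all ones when it is negative, so q = h + 1 ranges over
+// 0..4, with q = 0 exactly in the negative case. Building the digits used to
+// subtract q * p_256 via the subtract-from-zero sequence below keeps them all
+// zero when q = 0 (the "sub #1" form used in cmsub_p256 assumes q >= 1), and
+// the final csetm-driven masked addition of p_256 lands the result in
+// [0, p_256).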
+ +#define cmsub41_p256(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + lsr x4, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x5, x4, #1 __LF \ + lsl x8, x5, #32 __LF \ + subs x6, xzr, x8 __LF \ + sbcs x7, xzr, xzr __LF \ + sbc x8, x8, x5 __LF \ + adds x0, x0, x5 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x8 __LF \ + csetm x5, cc __LF \ + adds x0, x0, x5 __LF \ + and x6, x5, #4294967295 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, xzr __LF \ + neg x7, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2, computed as (p_256 - P2) << 3 + 3 * P1 + +#define cmsub38_p256(P0,P1,P2) \ + mov x1, 8 __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x2, #4294967295 __LF \ + sbcs x10, x2, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, xzr, x11 __LF \ + mov x2, #-4294967295 __LF \ + sbc x12, x2, x12 __LF \ + lsl x3, x9, #3 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + extr x6, x12, x11, #61 __LF \ + lsr x7, x12, #61 __LF \ + mov x1, 3 __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x8, x7, #1 __LF \ + lsl x10, x8, #32 __LF \ + adds x6, x6, x10 __LF \ + adc x7, x7, xzr __LF \ + neg x9, x8 __LF \ + sub x10, x10, #1 __LF \ + subs x3, x3, x9 __LF \ + sbcs x4, x4, x10 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, x8 __LF \ + sbc x8, x7, x8 __LF \ + adds x3, x3, x8 __LF \ + and x9, x8, #4294967295 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + neg x10, x9 __LF \ + adc x6, x6, x10 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(p256_montjdouble_alt): + +// Make room on stack for temporary variables + + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p256(z2,z_1) + montsqr_p256(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_p256(t2,x_1,z2) + weakadd_p256(t1,x_1,z2) + montmul_p256(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_p256(t1,y_1,z_1) + montmul_p256(xy2,x_1,y2) + montsqr_p256(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_p256(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p256(d,12,xy2,9,x4p) + sub_p256(t1,t1,z2) + +// y4 = y^4 + + montsqr_p256(y4,y2) + +// dx2 = d * x2p + + montmul_p256(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_p256(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_p256(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p256(y_3,dx2,y4) + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git 
a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd.S new file mode 100644 index 00000000000..bd388d03e7a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd.S @@ -0,0 +1,507 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjmixadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x17 +#define input_x x19 +#define input_y x20 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_p256 but uses x0 in place of x17 + +#define montmul_p256(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2] __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x3, x4 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + lsl x0, x11, #32 __LF \ + subs x1, x11, x0 __LF \ + lsr x16, x11, #32 __LF \ + sbc x11, x11, x16 __LF \ + adds x12, x12, x0 __LF \ + adcs x13, x13, x16 __LF \ + adcs x14, x14, x1 __LF \ + adc x11, x11, xzr __LF \ + lsl x0, x12, #32 __LF \ + subs x1, x12, x0 __LF \ + lsr x16, x12, 
#32 __LF \ + sbc x12, x12, x16 __LF \ + adds x13, x13, x0 __LF \ + adcs x14, x14, x16 __LF \ + adcs x11, x11, x1 __LF \ + adc x12, x12, xzr __LF \ + stp x13, x14, [P0] __LF \ + stp x11, x12, [P0+16] __LF \ + mul x11, x5, x9 __LF \ + mul x13, x6, x10 __LF \ + umulh x12, x5, x9 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x6, x10 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x5, x6 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x10, x9 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + subs x3, x5, x3 __LF \ + sbcs x4, x6, x4 __LF \ + ngc x5, xzr __LF \ + cmn x5, #1 __LF \ + eor x3, x3, x5 __LF \ + adcs x3, x3, xzr __LF \ + eor x4, x4, x5 __LF \ + adcs x4, x4, xzr __LF \ + subs x7, x7, x9 __LF \ + sbcs x8, x8, x10 __LF \ + ngc x9, xzr __LF \ + cmn x9, #1 __LF \ + eor x7, x7, x9 __LF \ + adcs x7, x7, xzr __LF \ + eor x8, x8, x9 __LF \ + adcs x8, x8, xzr __LF \ + eor x10, x5, x9 __LF \ + ldp x15, x1, [P0] __LF \ + adds x15, x11, x15 __LF \ + adcs x1, x12, x1 __LF \ + ldp x5, x9, [P0+16] __LF \ + adcs x5, x13, x5 __LF \ + adcs x9, x14, x9 __LF \ + adc x2, xzr, xzr __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x3, x3, x4 __LF \ + cneg x3, x3, lo __LF \ + csetm x4, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x3, x0 __LF \ + umulh x0, x3, x0 __LF \ + cinv x4, x4, lo __LF \ + eor x16, x16, x4 __LF \ + eor x0, x0, x4 __LF \ + cmn x4, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x4 __LF \ + cmn x10, #1 __LF \ + eor x11, x11, x10 __LF \ + adcs x11, x11, x15 __LF \ + eor x12, x12, x10 __LF \ + adcs x12, x12, x1 __LF \ + eor x13, x13, x10 __LF \ + adcs x13, x13, x5 __LF \ + eor x14, x14, x10 __LF \ + adcs x14, x14, x9 __LF \ + adcs x3, x2, x10 __LF \ + adcs x4, x10, xzr __LF \ + adc x10, x10, xzr __LF \ + adds x13, x13, x15 __LF \ + adcs x14, x14, x1 __LF \ + adcs x3, x3, x5 __LF \ + adcs x4, x4, x9 __LF \ + adc x10, x10, x2 __LF \ + lsl x0, x11, #32 __LF \ + subs x1, x11, x0 __LF \ + lsr x16, x11, #32 __LF \ + sbc x11, x11, x16 __LF \ + adds x12, x12, x0 __LF \ + adcs x13, x13, x16 __LF \ + adcs x14, x14, x1 __LF \ + adc x11, x11, xzr __LF \ + lsl x0, x12, #32 __LF \ + subs x1, x12, x0 __LF \ + lsr x16, x12, #32 __LF \ + sbc x12, x12, x16 __LF \ + adds x13, x13, x0 __LF \ + adcs x14, x14, x16 __LF \ + adcs x11, x11, x1 __LF \ + adc x12, x12, xzr __LF \ + adds x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adc x10, x10, xzr __LF \ + add x2, x10, #1 __LF \ + lsl x16, x2, #32 __LF \ + adds x4, x4, x16 __LF \ + adc x10, x10, xzr __LF \ + neg x15, x2 __LF \ + sub x16, x16, #1 __LF \ + subs x13, x13, x15 __LF \ + sbcs x14, x14, x16 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, x2 __LF \ + sbcs x7, x10, x2 __LF \ + adds x13, x13, x7 __LF \ + mov x10, #4294967295 __LF \ + and x10, x10, x7 __LF \ + adcs x14, x14, x10 __LF \ + adcs x3, x3, xzr __LF \ + mov x10, #-4294967295 __LF \ + and x10, x10, x7 __LF \ + adc x4, x4, x10 __LF \ + stp x13, x14, [P0] __LF \ + stp x3, x4, 
[P0+16] + +// Corresponds to bignum_montsqr_p256 but uses x0 in place of x17 + +#define montsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + ldp x4, x5, [P1+16] __LF \ + umull x15, w2, w2 __LF \ + lsr x11, x2, #32 __LF \ + umull x16, w11, w11 __LF \ + umull x11, w2, w11 __LF \ + adds x15, x15, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x16, x16, x11 __LF \ + umull x0, w3, w3 __LF \ + lsr x11, x3, #32 __LF \ + umull x1, w11, w11 __LF \ + umull x11, w3, w11 __LF \ + mul x12, x2, x3 __LF \ + umulh x13, x2, x3 __LF \ + adds x0, x0, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x1, x1, x11 __LF \ + adds x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x16, x12 __LF \ + adcs x0, x0, x13 __LF \ + adc x1, x1, xzr __LF \ + lsl x12, x15, #32 __LF \ + subs x13, x15, x12 __LF \ + lsr x11, x15, #32 __LF \ + sbc x15, x15, x11 __LF \ + adds x16, x16, x12 __LF \ + adcs x0, x0, x11 __LF \ + adcs x1, x1, x13 __LF \ + adc x15, x15, xzr __LF \ + lsl x12, x16, #32 __LF \ + subs x13, x16, x12 __LF \ + lsr x11, x16, #32 __LF \ + sbc x16, x16, x11 __LF \ + adds x0, x0, x12 __LF \ + adcs x1, x1, x11 __LF \ + adcs x15, x15, x13 __LF \ + adc x16, x16, xzr __LF \ + mul x6, x2, x4 __LF \ + mul x14, x3, x5 __LF \ + umulh x8, x2, x4 __LF \ + subs x10, x2, x3 __LF \ + cneg x10, x10, lo __LF \ + csetm x13, lo __LF \ + subs x12, x5, x4 __LF \ + cneg x12, x12, lo __LF \ + mul x11, x10, x12 __LF \ + umulh x12, x10, x12 __LF \ + cinv x13, x13, lo __LF \ + eor x11, x11, x13 __LF \ + eor x12, x12, x13 __LF \ + adds x7, x6, x8 __LF \ + adc x8, x8, xzr __LF \ + umulh x9, x3, x5 __LF \ + adds x7, x7, x14 __LF \ + adcs x8, x8, x9 __LF \ + adc x9, x9, xzr __LF \ + adds x8, x8, x14 __LF \ + adc x9, x9, xzr __LF \ + cmn x13, #1 __LF \ + adcs x7, x7, x11 __LF \ + adcs x8, x8, x12 __LF \ + adc x9, x9, x13 __LF \ + adds x6, x6, x6 __LF \ + adcs x7, x7, x7 __LF \ + adcs x8, x8, x8 __LF \ + adcs x9, x9, x9 __LF \ + adc x10, xzr, xzr __LF \ + adds x6, x6, x0 __LF \ + adcs x7, x7, x1 __LF \ + adcs x8, x8, x15 __LF \ + adcs x9, x9, x16 __LF \ + adc x10, x10, xzr __LF \ + lsl x12, x6, #32 __LF \ + subs x13, x6, x12 __LF \ + lsr x11, x6, #32 __LF \ + sbc x6, x6, x11 __LF \ + adds x7, x7, x12 __LF \ + adcs x8, x8, x11 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x6 __LF \ + adc x6, xzr, xzr __LF \ + lsl x12, x7, #32 __LF \ + subs x13, x7, x12 __LF \ + lsr x11, x7, #32 __LF \ + sbc x7, x7, x11 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x13 __LF \ + adcs x6, x6, x7 __LF \ + adc x7, xzr, xzr __LF \ + mul x11, x4, x4 __LF \ + adds x8, x8, x11 __LF \ + mul x12, x5, x5 __LF \ + umulh x11, x4, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + umulh x12, x5, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x7, x7, xzr __LF \ + mul x11, x4, x5 __LF \ + umulh x12, x4, x5 __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adc x13, xzr, xzr __LF \ + adds x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + adcs x6, x6, x13 __LF \ + adcs x7, x7, xzr __LF \ + mov x11, #4294967295 __LF \ + adds x5, x8, #1 __LF \ + sbcs x11, x9, x11 __LF \ + mov x13, #-4294967295 __LF \ + sbcs x12, x10, xzr __LF \ + sbcs x13, x6, x13 __LF \ + sbcs xzr, x7, xzr __LF \ + csel x8, x5, x8, hs __LF \ + csel x9, x11, x9, hs __LF \ + csel x10, x12, x10, hs __LF \ + csel x6, x13, x6, hs __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x6, [P0+16] + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, 
x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, xzr __LF \ + mov x4, #0xffffffff00000001 __LF \ + and x4, x4, x3 __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(p256_montjmixadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_p256(zp2,z_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,zp2,x_2) + montmul_p256(y2a,zp2,y2a) + + sub_p256(xd,x2a,x_1) + sub_p256(yd,y2a,y_1) + + montsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x_1) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(resz,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y_1) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_256), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #0x0000000000000001 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [resz+16] + mov x12, #0xffffffffffffffff + mov x13, #0x00000000fffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore registers and return + + add sp, sp, NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd_alt.S new file mode 100644 index 00000000000..90f49bc3568 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjmixadd_alt.S @@ -0,0 +1,511 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjmixadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_p256_alt except registers + +#define montmul_p256(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF 
\ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + mov x10, #0xffffffff00000001 __LF \ + adds x13, x13, x12, lsl #32 __LF \ + lsr x11, x12, #32 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x12, x10 __LF \ + umulh x12, x12, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x12, x12, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + adds x14, x14, x13, lsl #32 __LF \ + lsr x11, x13, #32 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x13, x10 __LF \ + umulh x13, x13, x10 __LF \ + adcs x12, x12, x11 __LF \ + adc x13, x13, xzr __LF \ + adds x0, x0, x14, lsl #32 __LF \ + lsr x11, x14, #32 __LF \ + adcs x12, x12, x11 __LF \ + mul x11, x14, x10 __LF \ + umulh x14, x14, x10 __LF \ + adcs x13, x13, x11 __LF \ + adc x14, x14, xzr __LF \ + adds x12, x12, x0, lsl #32 __LF \ + lsr x11, x0, #32 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x0, x10 __LF \ + umulh x0, x0, x10 __LF \ + adcs x14, x14, x11 __LF \ + adc x0, x0, xzr __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + sbcs x4, x14, xzr __LF \ + sbcs x5, x0, x10 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + adds x9, x9, x8, lsl #32 __LF \ + lsr x3, x8, #32 __LF \ + adcs x10, x10, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x8, x3 __LF \ + umulh x8, x8, x3 __LF \ + adcs x11, x11, x2 __LF \ + adc x8, x8, xzr __LF \ + adds x10, x10, x9, lsl #32 __LF \ + lsr x3, x9, #32 __LF \ + adcs x11, x11, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x9, x3 __LF \ + umulh x9, x9, x3 __LF \ + adcs x8, x8, x2 __LF \ + adc x9, x9, xzr __LF \ + adds x11, x11, x10, lsl #32 __LF \ + lsr x3, x10, #32 __LF \ + adcs x8, x8, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x10, x3 __LF \ + umulh x10, x10, x3 __LF \ + 
adcs x9, x9, x2 __LF \ + adc x10, x10, xzr __LF \ + adds x8, x8, x11, lsl #32 __LF \ + lsr x3, x11, #32 __LF \ + adcs x9, x9, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x11, x3 __LF \ + umulh x11, x11, x3 __LF \ + adcs x10, x10, x2 __LF \ + adc x11, x11, xzr __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, cs __LF \ + mov x3, #0xffffffff __LF \ + mov x5, #0xffffffff00000001 __LF \ + adds x12, x8, #0x1 __LF \ + sbcs x13, x9, x3 __LF \ + sbcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF \ + csel x8, x8, x12, cc __LF \ + csel x9, x9, x13, cc __LF \ + csel x10, x10, x14, cc __LF \ + csel x11, x11, x7, cc __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_p256(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + adds x9, x9, x8, lsl #32 __LF \ + lsr x3, x8, #32 __LF \ + adcs x10, x10, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x8, x3 __LF \ + umulh x8, x8, x3 __LF \ + adcs x11, x11, x2 __LF \ + adc x8, x8, xzr __LF \ + adds x10, x10, x9, lsl #32 __LF \ + lsr x3, x9, #32 __LF \ + adcs x11, x11, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x9, x3 __LF \ + umulh x9, x9, x3 __LF \ + adcs x8, x8, x2 __LF \ + adc x9, x9, xzr __LF \ + adds x11, x11, x10, lsl #32 __LF \ + lsr x3, x10, #32 __LF \ + adcs x8, x8, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x10, x3 __LF \ + umulh x10, x10, x3 __LF \ + adcs x9, x9, x2 __LF \ + adc x10, x10, xzr __LF \ + adds x8, x8, x11, lsl #32 __LF \ + lsr x3, x11, #32 __LF \ + adcs x9, x9, x3 __LF \ + mov x3, #0xffffffff00000001 __LF \ + mul x2, x11, x3 __LF \ + umulh x11, x11, x3 __LF \ + adcs x10, x10, x2 __LF \ + adc x11, x11, xzr __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + mov x2, #0xffffffffffffffff __LF \ + csel x2, xzr, x2, cc __LF \ + mov x3, #0xffffffff __LF \ + csel x3, xzr, x3, cc __LF \ + mov x5, #0xffffffff00000001 __LF \ + csel x5, xzr, x5, cc __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, x5 __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_p256 + +#define 
sub_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, xzr __LF \ + mov x4, #0xffffffff00000001 __LF \ + and x4, x4, x3 __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(p256_montjmixadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p256(zp2,z_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,zp2,x_2) + montmul_p256(y2a,zp2,y2a) + + sub_p256(xd,x2a,x_1) + sub_p256(yd,y2a,y_1) + + amontsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x_1) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(resz,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y_1) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_256), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #0x0000000000000001 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [resz+16] + mov x12, #0xffffffffffffffff + mov x13, #0x00000000fffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul.S similarity index 99% rename from third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul.S index 246421ff37d..51f05ac2732 100644 --- a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul.S @@ -60,33 +60,33 @@ // which doesn't accept repetitions, assembler macros etc. 
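
For readability, the 8-multiply / 3-square / 7-subtract sequence in the main code of p256_montjmixadd_alt above is transcribed below into plain C over a small toy prime field. This is an illustration only, not part of the patch: fmul/fsqr/fsqr/fsub and the 31-bit prime TOY_P stand in for the 256-bit Montgomery-domain macros, and the z_1 = 0 multiplexing that follows the sequence is omitted.

#include <stdint.h>
#include <stdio.h>

// Toy-field transcription of the field-operation sequence in
// p256_montjmixadd_alt (same dataflow; a 31-bit prime and plain modular
// arithmetic stand in for the 256-bit Montgomery-domain macros).
#define TOY_P 2147483647ULL   // stand-in prime, NOT p_256

static uint64_t fmul(uint64_t a, uint64_t b) { return (a * b) % TOY_P; }
static uint64_t fsqr(uint64_t a)             { return (a * a) % TOY_P; }
static uint64_t fsub(uint64_t a, uint64_t b) { return (a + TOY_P - b) % TOY_P; }

// (x1,y1,z1) Jacobian + (x2,y2) affine -> (x3,y3,z3) Jacobian;
// the z_1 = 0 multiplexing done after the sequence is omitted here.
void mixadd(uint64_t x1, uint64_t y1, uint64_t z1,
            uint64_t x2, uint64_t y2,
            uint64_t *x3, uint64_t *y3, uint64_t *z3) {
  uint64_t zp2  = fsqr(z1);          // amontsqr_p256(zp2,z_1)
  uint64_t y2a  = fmul(z1, y2);      // montmul_p256(y2a,z_1,y_2)
  uint64_t x2a  = fmul(zp2, x2);     // montmul_p256(x2a,zp2,x_2)
  y2a           = fmul(zp2, y2a);    // montmul_p256(y2a,zp2,y2a)
  uint64_t xd   = fsub(x2a, x1);     // sub_p256(xd,x2a,x_1)
  uint64_t yd   = fsub(y2a, y1);     // sub_p256(yd,y2a,y_1)
  uint64_t zz   = fsqr(xd);          // amontsqr_p256(zz,xd)
  uint64_t ww   = fsqr(yd);          // montsqr_p256(ww,yd)
  uint64_t zzx1 = fmul(zz, x1);      // montmul_p256(zzx1,zz,x_1)
  uint64_t zzx2 = fmul(zz, x2a);     // montmul_p256(zzx2,zz,x2a)
  uint64_t resx = fsub(ww, zzx1);    // sub_p256(resx,ww,zzx1)
  uint64_t t1   = fsub(zzx2, zzx1);  // sub_p256(t1,zzx2,zzx1)
  uint64_t resz = fmul(xd, z1);      // montmul_p256(resz,xd,z_1)
  resx          = fsub(resx, zzx2);  // sub_p256(resx,resx,zzx2)
  uint64_t t2   = fsub(zzx1, resx);  // sub_p256(t2,zzx1,resx)
  t1            = fmul(t1, y1);      // montmul_p256(t1,t1,y_1)
  t2            = fmul(yd, t2);      // montmul_p256(t2,yd,t2)
  *x3 = resx;
  *y3 = fsub(t2, t1);                // sub_p256(resy,t2,t1)
  *z3 = resz;
}

int main(void) {
  uint64_t x3, y3, z3;
  mixadd(5, 7, 11, 13, 17, &x3, &y3, &z3);   // arbitrary small field elements
  printf("%llu %llu %llu\n", (unsigned long long)x3,
         (unsigned long long)y3, (unsigned long long)z3);
  return 0;
}
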
#define selectblock(I) \ - cmp x14, #(1*I); \ - ldp x12, x13, [x15]; \ - csel x0, x12, x0, eq; \ - csel x1, x13, x1, eq; \ - ldp x12, x13, [x15, #16]; \ - csel x2, x12, x2, eq; \ - csel x3, x13, x3, eq; \ - ldp x12, x13, [x15, #32]; \ - csel x4, x12, x4, eq; \ - csel x5, x13, x5, eq; \ - ldp x12, x13, [x15, #48]; \ - csel x6, x12, x6, eq; \ - csel x7, x13, x7, eq; \ - ldp x12, x13, [x15, #64]; \ - csel x8, x12, x8, eq; \ - csel x9, x13, x9, eq; \ - ldp x12, x13, [x15, #80]; \ - csel x10, x12, x10, eq; \ - csel x11, x13, x11, eq; \ + cmp x14, #(1*I) __LF \ + ldp x12, x13, [x15] __LF \ + csel x0, x12, x0, eq __LF \ + csel x1, x13, x1, eq __LF \ + ldp x12, x13, [x15, #16] __LF \ + csel x2, x12, x2, eq __LF \ + csel x3, x13, x3, eq __LF \ + ldp x12, x13, [x15, #32] __LF \ + csel x4, x12, x4, eq __LF \ + csel x5, x13, x5, eq __LF \ + ldp x12, x13, [x15, #48] __LF \ + csel x6, x12, x6, eq __LF \ + csel x7, x13, x7, eq __LF \ + ldp x12, x13, [x15, #64] __LF \ + csel x8, x12, x8, eq __LF \ + csel x9, x13, x9, eq __LF \ + ldp x12, x13, [x15, #80] __LF \ + csel x10, x12, x10, eq __LF \ + csel x11, x13, x11, eq __LF \ add x15, x15, #96 // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(p256_montjscalarmul): diff --git a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul_alt.S similarity index 98% rename from third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul_alt.S index 8ac5806a725..74ed964f9ae 100644 --- a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_montjscalarmul_alt.S @@ -60,33 +60,33 @@ // which doesn't accept repetitions, assembler macros etc. 
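
The selectblock(I) macro changed in these two hunks does a constant-time table lookup: it compares the secret index against I and uses csel so that every table row is read regardless of which one is kept. A rough C analogue of that masking idea follows; ct_select_row is a hypothetical illustration of the technique, not an s2n-bignum function, and it indexes rows from 0 rather than 1.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

// Branch-free row selection in the spirit of selectblock(I): every row of
// the table is read, and a mask derived from comparing the (secret) index
// decides which row survives in the output.
static void ct_select_row(uint64_t *out, const uint64_t *table,
                          size_t rows, size_t width, uint64_t idx) {
  for (size_t j = 0; j < width; j++) out[j] = 0;
  for (size_t i = 0; i < rows; i++) {
    uint64_t diff = (uint64_t)i ^ idx;
    // mask = all-ones when i == idx, else all-zeros
    uint64_t mask = ((diff | (0 - diff)) >> 63) - 1;
    for (size_t j = 0; j < width; j++)
      out[j] |= table[i * width + j] & mask;
  }
}

int main(void) {
  // 8 rows of 12 words, like the 96-byte Jacobian entries walked by selectblock
  uint64_t table[8 * 12];
  for (int i = 0; i < 8 * 12; i++) table[i] = (uint64_t)i;
  uint64_t row[12];
  ct_select_row(row, table, 8, 12, 5);
  printf("%llu\n", (unsigned long long)row[0]);   // prints 60 (= 5 * 12)
  return 0;
}
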
#define selectblock(I) \ - cmp x14, #(1*I); \ - ldp x12, x13, [x15]; \ - csel x0, x12, x0, eq; \ - csel x1, x13, x1, eq; \ - ldp x12, x13, [x15, #16]; \ - csel x2, x12, x2, eq; \ - csel x3, x13, x3, eq; \ - ldp x12, x13, [x15, #32]; \ - csel x4, x12, x4, eq; \ - csel x5, x13, x5, eq; \ - ldp x12, x13, [x15, #48]; \ - csel x6, x12, x6, eq; \ - csel x7, x13, x7, eq; \ - ldp x12, x13, [x15, #64]; \ - csel x8, x12, x8, eq; \ - csel x9, x13, x9, eq; \ - ldp x12, x13, [x15, #80]; \ - csel x10, x12, x10, eq; \ - csel x11, x13, x11, eq; \ + cmp x14, #(1*I) __LF \ + ldp x12, x13, [x15] __LF \ + csel x0, x12, x0, eq __LF \ + csel x1, x13, x1, eq __LF \ + ldp x12, x13, [x15, #16] __LF \ + csel x2, x12, x2, eq __LF \ + csel x3, x13, x3, eq __LF \ + ldp x12, x13, [x15, #32] __LF \ + csel x4, x12, x4, eq __LF \ + csel x5, x13, x5, eq __LF \ + ldp x12, x13, [x15, #48] __LF \ + csel x6, x12, x6, eq __LF \ + csel x7, x13, x7, eq __LF \ + ldp x12, x13, [x15, #64] __LF \ + csel x8, x12, x8, eq __LF \ + csel x9, x13, x9, eq __LF \ + ldp x12, x13, [x15, #80] __LF \ + csel x10, x12, x10, eq __LF \ + csel x11, x13, x11, eq __LF \ add x15, x15, #96 // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(p256_montjscalarmul_alt): diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul.S new file mode 100644 index 00000000000..317008a461d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul.S @@ -0,0 +1,8575 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for P-256 +// Input scalar[4], point[8]; output res[8] +// +// extern void p256_scalarmul +// (uint64_t res[static 8], +// uint64_t scalar[static 4], +// uint64_t point[static 8]); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, returns the point (X,Y) = n * P. The input and output +// are affine points, and in the case of the point at infinity as +// the result, (0,0) is returned. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Safe copies of inputs (res lasts the whole code, point not so long) +// and additional values in variables, with some aliasing + +#define res x19 +#define sgn x20 +#define j x20 +#define point x21 + +// Intermediate variables on the stack. The last z2, z3 values can +// safely be overlaid on the table, which is no longer needed at the end. 
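
For orientation, the header comment of the new p256_scalarmul.S above fixes the C-level contract; a minimal caller might look like the sketch below. This is a hypothetical wrapper, not part of the patch, and it assumes the assembled routine is linked in and that each coordinate is four little-endian 64-bit limbs, as in the rest of these routines.

#include <stdint.h>

// Prototype exactly as given in the file header above.
extern void p256_scalarmul(uint64_t res[static 8],
                           uint64_t scalar[static 4],
                           uint64_t point[static 8]);

// Hypothetical wrapper: copies the inputs into writable buffers and calls
// the assembly routine.
void scalarmul_example(const uint64_t n[4], const uint64_t P_affine[8],
                       uint64_t out_affine[8]) {
  uint64_t scalar[4], point[8];
  for (int i = 0; i < 4; i++) scalar[i] = n[i];
  for (int i = 0; i < 8; i++) point[i] = P_affine[i];
  // out_affine receives X then Y of n * P; the point at infinity comes
  // back as (0, 0), per the header comment.
  p256_scalarmul(out_affine, scalar, point);
}
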
+ +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +#define z2 sp, #(7*NUMSIZE) +#define z3 sp, #(8*NUMSIZE) + +#define NSPACE #(31*NUMSIZE) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(p256_scalarmul): + + stp x19, x20, [sp, #-16]! + stp x21, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + mov res, x0 + mov point, x2 + +// Load the digits of group order n_256 = [x12;x13;x14;x15] + + movbig(x12, #0xf3b9, #0xcac2, #0xfc63, #0x2551) + movbig(x13, #0xbce6, #0xfaad, #0xa717, #0x9e84) + mov x14, #0xffffffffffffffff + mov x15, #0xffffffff00000000 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + + subs x6, x2, x12 + sbcs x7, x3, x13 + sbcs x8, x4, x14 + sbcs x9, x5, x15 + + csel x2, x2, x6, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + +// Now if the top bit of the reduced scalar is set, negate it mod n_256, +// i.e. do n |-> n_256 - n. Remember the sign as "sgn" so we can +// correspondingly negate the point below. + + subs x6, x12, x2 + sbcs x7, x13, x3 + sbcs x8, x14, x4 + sbc x9, x15, x5 + + tst x5, #0x8000000000000000 + csel x2, x2, x6, eq + csel x3, x3, x7, eq + csel x4, x4, x8, eq + csel x5, x5, x9, eq + cset sgn, ne + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + mov x6, 0x8888888888888888 + adds x2, x2, x6 + adcs x3, x3, x6 + bic x7, x6, #0xF000000000000000 + adcs x4, x4, x6 + adc x5, x5, x7 + + stp x2, x3, [scalarb] + stp x4, x5, [scalarb+16] + +// Set the tab[0] table entry to Montgomery-Jacobian point = 1 * P +// The z coordinate is just the Montgomery form of the constant 1. 
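
Since p_256 < 2^256 < 2 * p_256, that Montgomery form of 1 is 2^256 mod p_256 = 2^256 - p_256, i.e. the 256-bit two's-complement negation of p_256. The short standalone C check below (not part of the patch) reproduces the four words stored into the z coordinate just after this point.

#include <stdint.h>
#include <stdio.h>

// Standalone check: the Montgomery form of 1 for P-256 is 2^256 mod p_256
// = 2^256 - p_256, i.e. the 256-bit two's-complement negation of p_256.
int main(void) {
  // p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit limbs
  uint64_t p[4] = {0xffffffffffffffffULL, 0x00000000ffffffffULL,
                   0x0000000000000000ULL, 0xffffffff00000001ULL};
  uint64_t r[4];
  unsigned carry = 1;              // two's complement: ~p + 1 across limbs
  for (int i = 0; i < 4; i++) {
    uint64_t t = ~p[i] + carry;
    carry = (carry && t == 0);     // carry out only if ~p[i] + 1 wrapped to 0
    r[i] = t;
  }
  // Prints 0000000000000001, ffffffff00000000, ffffffffffffffff,
  // 00000000fffffffe, matching the words stored as the z coordinate above.
  for (int i = 0; i < 4; i++) printf("%016llx\n", (unsigned long long)r[i]);
  return 0;
}
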
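
Separately, the recoding constant 0x0888...888 added to the reduced scalar above is what lets the main loop further down treat every 4-bit window except the top one as a signed digit (window - 8) in [-8, 7], handled by a lookup of |digit| * P and an optional negation of y, while the top window is used unrecoded. The reduced-width C check below (16-bit scalars with the top bit clear, not part of the patch) verifies that identity.

#include <stdint.h>
#include <stdio.h>

// Reduced-width illustration of the signed-digit recoding: add 8 to every
// 4-bit window except the top one, then read each low window as (w - 8).
int main(void) {
  for (uint32_t n = 0; n < (1u << 15); n++) {   // top bit clear, as in the code
    uint32_t b = n + 0x0888;                    // recoding bias: +8 per low window
    int32_t acc = (int32_t)((b >> 12) & 0xf);   // top window, unrecoded
    for (int i = 2; i >= 0; i--) {
      int32_t d = (int32_t)((b >> (4 * i)) & 0xf) - 8;  // signed digit in [-8,7]
      acc = acc * 16 + d;                       // "double 4 times then add digit"
    }
    if ((uint32_t)acc != n) { printf("mismatch at %u\n", n); return 1; }
  }
  printf("recoding identity holds for all 15-bit scalars\n");
  return 0;
}
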
+ + add x0, tab + mov x1, point + bl p256_scalarmul_local_tomont_p256 + + add x1, point, #32 + add x0, tab+32 + bl p256_scalarmul_local_tomont_p256 + + mov x0, #0x0000000000000001 + mov x1, #0xffffffff00000000 + stp x0, x1, [tab+64] + mov x2, #0xffffffffffffffff + mov x3, #0x00000000fffffffe + stp x2, x3, [tab+80] + +// If the top bit of the scalar was set, negate (y coordinate of) the point + + ldp x4, x5, [tab+32] + ldp x6, x7, [tab+48] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + cmp sgn, xzr + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tab+32] + stp x6, x7, [tab+48] + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + add x0, tab+96*1 + add x1, tab + bl p256_scalarmul_local_p256_montjdouble + + add x0, tab+96*2 + add x1, tab+96*1 + add x2, tab + bl p256_scalarmul_local_p256_montjmixadd + + add x0, tab+96*3 + add x1, tab+96*1 + bl p256_scalarmul_local_p256_montjdouble + + add x0, tab+96*4 + add x1, tab+96*3 + add x2, tab + bl p256_scalarmul_local_p256_montjmixadd + + add x0, tab+96*5 + add x1, tab+96*2 + bl p256_scalarmul_local_p256_montjdouble + + add x0, tab+96*6 + add x1, tab+96*5 + add x2, tab + bl p256_scalarmul_local_p256_montjmixadd + + add x0, tab+96*7 + add x1, tab+96*3 + bl p256_scalarmul_local_p256_montjdouble + +// Initialize the accumulator as a table entry for top 4 bits (unrecoded) + + ldr x14, [scalarb+24] + lsr x14, x14, #60 + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + + mov j, #252 + +// Main loop over size-4 bitfields: double 4 times then add signed digit + +p256_scalarmul_loop: + sub j, j, #4 + + add x0, acc + add x1, acc + bl p256_scalarmul_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_local_p256_montjdouble + + lsr x2, j, #6 + ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly + lsr x14, x14, j + and x14, x14, #15 + + subs x14, x14, #8 + cset x16, lo // x16 = sign of digit (1 = negative) + cneg x14, x14, lo // x14 = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, 
eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + +// Store it to "tabent" with the y coordinate optionally negated + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + cmp x16, xzr + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + + add x0, acc + add x1, acc + add x2, tabent + bl p256_scalarmul_local_p256_montjadd + + cbnz j, p256_scalarmul_loop + +// That's the end of the main loop, and we just need to translate +// back from the Jacobian representation to affine. First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + add x0, z2 + add x1, acc+64 + bl p256_scalarmul_local_montsqr_p256 + + add x0, z3 + add x2, z2 + add x1, acc+64 + bl p256_scalarmul_local_montmul_p256 + + add x0, z2 + add x1, z3 + bl p256_scalarmul_local_demont_p256 + + add x0, z3 + add x1, z2 + bl p256_scalarmul_local_inv_p256 + + add x0, z2 + add x2, z3 + add x1, acc+64 + bl p256_scalarmul_local_montmul_p256 + +// Convert back from Jacobian (X,Y,Z) |-> (X/Z^2, Y/Z^3) + + add x1, acc + add x2, z2 + mov x0, res + bl p256_scalarmul_local_montmul_p256 + + add x0, res, #32 + add x1, acc+32 + add x2, z3 + bl p256_scalarmul_local_montmul_p256 + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmul_local_demont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + lsl x7, x2, #32 + subs x8, x2, x7 + lsr x6, x2, #32 + sbc x2, x2, x6 + adds x3, x3, x7 + adcs x4, x4, x6 + adcs x5, x5, x8 + adc x2, x2, xzr + lsl x7, x3, #32 + subs x8, x3, x7 + lsr x6, x3, #32 + sbc x3, x3, x6 + adds x4, x4, x7 + adcs x5, x5, x6 + adcs x2, x2, x8 + adc x3, x3, xzr + lsl x7, x4, #32 + subs x8, x4, x7 + lsr x6, x4, #32 + sbc x4, x4, x6 + adds x5, x5, x7 + adcs x2, x2, x6 + adcs x3, x3, x8 + adc x4, x4, xzr + lsl x7, x5, #32 + subs x8, x5, x7 + lsr x6, x5, #32 + sbc x5, x5, x6 + adds x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x8 + adc x5, x5, xzr + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +p256_scalarmul_local_inv_p256: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
+ sub sp, sp, #0xa0 + mov x20, x0 + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [sp] + stp xzr, x13, [sp, #16] + str xzr, [sp, #32] + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #16] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + stp x2, x3, [sp, #48] + stp x4, x5, [sp, #64] + str xzr, [sp, #80] + stp xzr, xzr, [sp, #96] + stp xzr, xzr, [sp, #112] + mov x10, #0x4000000000000 + stp x10, xzr, [sp, #128] + stp xzr, xzr, [sp, #144] + mov x21, #0xa + mov x22, #0x1 + b p256_scalarmul_inv_midloop +p256_scalarmul_inv_loop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #48] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #64] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #56] + ldr x7, [sp, #24] + eor x1, x7, x14 + ldr x23, [sp, #32] + eor x3, x23, x14 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #72] + eor x1, x8, x15 + ldr x24, [sp, #80] + eor x0, x24, x15 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + asr x3, x3, #59 + str x3, [sp, #32] + eor x1, x7, x16 + eor x5, x23, x16 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + eor x0, x24, x17 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #64] + extr x2, x5, x2, #59 + str x2, [sp, #72] + asr x5, x5, #59 + str x5, [sp, #80] + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + eor x1, x7, x16 + 
mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #128] + adc x3, x3, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #136] + adc x4, x4, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #144] + adc x2, x2, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x6, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x6, x6, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x6, x6, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x6, x6, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + stp x1, x6, [sp, #96] + stp x5, x3, [sp, #112] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + ldp x0, x1, [sp, #128] + ldr x3, [sp, #144] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x3, x3, x11 + mov x10, #0x2000000000000000 + adcs x2, x2, x10 + mov x14, #0x1fffffffe0000000 + adc x5, x5, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x3, x3, x10 + adcs x2, x2, x14 + adcs x5, x5, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x3, x3, x11 + sbcs x2, x2, xzr + sbc x5, x5, x10 + stp x1, x3, [sp, #128] + stp x2, x5, [sp, #144] +p256_scalarmul_inv_midloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #48] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr 
x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, 
#0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, 
#1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge 
+ csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne p256_scalarmul_inv_loop + ldr x0, [sp] + ldr x1, [sp, #48] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x2, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x2, x2, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x2, x2, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x2, x2, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0xffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + stp x10, x11, [x20] + stp x12, x13, [x20, #16] + add sp, sp, #0xa0 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p256_scalarmul_local_montmul_p256: + ldr q20, [x2] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x6, x10, [x2] + ldp x11, x15, [x1, #16] 
+ rev64 v16.4S, v20.4S + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4S, v16.4S, v0.4S + umulh x12, x17, x10 + uzp1 v28.4S, v20.4S, v0.4S + subs x14, x11, x7 + ldr q20, [x2, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2D, v16.4S + umulh x4, x7, x6 + uzp1 v21.4S, v0.4S, v0.4S + cneg x11, x8, cc + shl v17.2D, v27.2D, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2D, v21.2S, v28.2S + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2S, v20.2D + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4S, v20.4S, v20.4S + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2S, v28.2D + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x2, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x9, x3, x13 + adcs x3, x8, x7 + umulh x8, x14, x11 + umull v21.2D, v0.2S, v1.2S + adcs x12, x10, x12 + umull v3.2D, v0.2S, v16.2S + adc x15, x15, xzr + rev64 v24.4S, v20.4S + stp x12, x15, [x0, #16] + movi v2.2D, #0x00000000ffffffff + mul x10, x14, x11 + mul v4.4S, v24.4S, v28.4S + subs x13, x14, x5 + uzp2 v19.4S, v28.4S, v28.4S + csetm x15, cc + usra v3.2D, v21.2D, #32 + mul x7, x5, x1 + umull v21.2D, v19.2S, v16.2S + cneg x13, x13, cc + uaddlp v5.2D, v4.4S + subs x11, x1, x11 + and v16.16B, v3.16B, v2.16B + umulh x5, x5, x1 + shl v24.2D, v5.2D, #32 + cneg x11, x11, cc + umlal v16.2D, v19.2S, v1.2S + cinv x12, x15, cc + umlal v24.2D, v0.2S, v1.2S + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + stp x9, x3, [x0] + usra v21.2D, v3.2D, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2D, v16.2D, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + ldp x15, x8, [x0] + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + ldp x9, x13, [x0, #16] + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x15 + adcs x15, x16, x8 + eor x5, x17, x4 + adcs x9, x1, x9 + eor x1, x10, x5 + adcs x16, x2, x13 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc 
x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [x0] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [x0, #16] + ret + +p256_scalarmul_local_montsqr_p256: + ldr q19, [x1] + ldp x9, x13, [x1] + ldr q23, [x1, #16] + ldr q0, [x1] + ldp x1, x10, [x1, #16] + uzp2 v29.4S, v19.4S, v19.4S + xtn v4.2S, v19.2D + umulh x8, x9, x13 + rev64 v20.4S, v23.4S + umull v16.2D, v19.2S, v19.2S + umull v1.2D, v29.2S, v4.2S + mul v20.4S, v20.4S, v0.4S + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2D, v19.4S, v19.4S + mov x4, v16.d[0] + uzp1 v17.4S, v23.4S, v0.4S + uaddlp v19.2D, v20.4S + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4S, v0.4S, v0.4S + shl v19.2D, v19.2D, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2D, v20.2S, v17.2S + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ret + +p256_scalarmul_local_tomont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x1, #0xffffffffffffffff + mov x7, #0xffffffff + mov x9, #0xffffffff00000001 + subs x1, x2, x1 + sbcs x7, x3, x7 + sbcs x8, x4, xzr + sbcs x9, x5, 
x9 + csel x2, x2, x1, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + cmp xzr, xzr + extr x9, x5, x4, #32 + adcs xzr, x4, x9 + lsr x9, x5, #32 + adcs x9, x5, x9 + csetm x6, cs + orr x9, x9, x6 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x4, x4, x7 + adc x5, x5, x8 + negs x6, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x6, x6 + sbcs x2, x2, x7 + sbcs x3, x3, x8 + sbcs x4, x4, x9 + sbcs x5, x5, x9 + adds x6, x6, x5 + mov x7, #0xffffffff + and x7, x7, x5 + adcs x2, x2, x7 + adcs x3, x3, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x5 + adc x4, x4, x7 + cmp xzr, xzr + extr x9, x4, x3, #32 + adcs xzr, x3, x9 + lsr x9, x4, #32 + adcs x9, x4, x9 + csetm x5, cs + orr x9, x9, x5 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x3, x3, x7 + adc x4, x4, x8 + negs x5, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x5, x5 + sbcs x6, x6, x7 + sbcs x2, x2, x8 + sbcs x3, x3, x9 + sbcs x4, x4, x9 + adds x5, x5, x4 + mov x7, #0xffffffff + and x7, x7, x4 + adcs x6, x6, x7 + adcs x2, x2, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x4 + adc x3, x3, x7 + cmp xzr, xzr + extr x9, x3, x2, #32 + adcs xzr, x2, x9 + lsr x9, x3, #32 + adcs x9, x3, x9 + csetm x4, cs + orr x9, x9, x4 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x2, x2, x7 + adc x3, x3, x8 + negs x4, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x4, x4 + sbcs x5, x5, x7 + sbcs x6, x6, x8 + sbcs x2, x2, x9 + sbcs x3, x3, x9 + adds x4, x4, x3 + mov x7, #0xffffffff + and x7, x7, x3 + adcs x5, x5, x7 + adcs x6, x6, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x3 + adc x2, x2, x7 + cmp xzr, xzr + extr x9, x2, x6, #32 + adcs xzr, x6, x9 + lsr x9, x2, #32 + adcs x9, x2, x9 + csetm x3, cs + orr x9, x9, x3 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x6, x6, x7 + adc x2, x2, x8 + negs x3, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x3, x3 + sbcs x4, x4, x7 + sbcs x5, x5, x8 + sbcs x6, x6, x9 + sbcs x2, x2, x9 + adds x3, x3, x2 + mov x7, #0xffffffff + and x7, x7, x2 + adcs x4, x4, x7 + adcs x5, x5, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x2 + adc x6, x6, x7 + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ret + +p256_scalarmul_local_p256_montjadd: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x30, [sp, #-16]! 
+ sub sp, sp, #0xe0 + mov x21, x0 + mov x22, x1 + mov x23, x2 + mov x0, sp + ldr q19, [x22, #64] + ldp x9, x13, [x22, #64] + ldr q23, [x22, #80] + ldr q0, [x22, #64] + ldp x1, x10, [x22, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x19, x3, x16, cs // cs = hs, nlast + csel x14, x8, x14, cs // cs = hs, nlast + csel x12, x11, x12, cs // cs = hs, nlast + csel x20, x5, x2, cs // cs = hs, nlast + stp x14, x12, [x0, #16] + stp x19, x20, [x0] + ldr q19, [x23, #64] + ldp x9, x13, [x23, #64] + ldr q23, [x23, #80] + ldr q0, [x23, #64] + ldp x1, x10, [x23, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, 
x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs // cs = hs, nlast + csel x14, x8, x14, cs // cs = hs, nlast + csel x12, x11, x12, cs // cs = hs, nlast + csel x2, x5, x2, cs // cs = hs, nlast + stp x14, x12, [sp, #176] + stp x16, x2, [sp, #160] + ldr q20, [x22, #32] + ldp x7, x17, [x23, #64] + ldr q0, [x23, #64] + ldp x6, x10, [x22, #32] + ldp x11, x15, [x23, #80] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [x23, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 
+ xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #192] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #208] + ldr q20, [x23, #32] + ldp x7, x17, [x22, #64] + ldr q0, [x22, #64] + ldp x6, x10, [x23, #32] + ldp x11, x15, [x22, #80] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // 
cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x23, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [x22, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, 
x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x24, x11, x13 + and x1, x1, x13 + adcs x25, x4, x1 + and x1, x12, x13 + stp x24, x25, [sp, #32] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #48] + mov x1, sp + ldr q20, [x23] + ldr q0, [x1] + ldp x6, x10, [x23] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x19, x20 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x20, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x19 + ldr q20, [x23, #16] + sbcs x5, x15, x20 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x19, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn 
x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #64] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #80] + ldr q20, [x22] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x22] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, 
x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + mov x1, sp + ldr q20, [sp, #32] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x25 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x24 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x25, x24 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x24, x7 + sbcs x9, x25, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, 
x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x24, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x25, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x24 + eor x1, x10, x5 + adcs x16, x2, x25 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #48] + ldr q20, [sp, #192] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [sp, #192] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #208] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last 
+ cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #208] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x9, x11, x13 + and x1, x1, x13 + adcs x10, x4, x1 + and x1, 
x12, x13 + stp x9, x10, [sp, #192] + adcs x11, x7, xzr + adc x12, x17, x1 + stp x11, x12, [sp, #208] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc // cc = lo, ul, last + adds x13, x5, x3 + and x4, x3, #0xffffffff + adcs x24, x6, x4 + adcs x25, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x26, x8, x4 + stp x13, x24, [sp, #160] + stp x25, x26, [sp, #176] + subs x5, x19, x9 + sbcs x6, x20, x10 + ldp x7, x8, [sp, #48] + sbcs x7, x7, x11 + sbcs x8, x8, x12 + csetm x3, cc // cc = lo, ul, last + adds x19, x5, x3 + and x4, x3, #0xffffffff + adcs x20, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x19, x20, [sp, #32] + stp x7, x8, [sp, #48] + ldr q19, [sp, #160] + ldr q23, [sp, #176] + ldr q0, [sp, #160] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x13, x24 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x13, x24 + umulh x15, x13, x25 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x13, x24 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x26, x25 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x24, x26 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x25, x26 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x25, x26 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x26, x26 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x26, x26 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x25, x25 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x25, x25 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs 
xzr, x15, xzr + csel x24, x3, x16, cs // cs = hs, nlast + csel x25, x8, x14, cs // cs = hs, nlast + csel x26, x11, x12, cs // cs = hs, nlast + csel x27, x5, x2, cs // cs = hs, nlast + stp x25, x26, [sp, #112] + stp x24, x27, [sp, #96] + mov x0, sp + ldr q19, [sp, #32] + ldr q23, [sp, #48] + ldr q0, [sp, #32] + ldp x1, x10, [sp, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x19, x20 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x19, x20 + umulh x15, x19, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x19, x20 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x20, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs // cs = hs, nlast + csel x14, x8, x14, cs // cs = hs, nlast + csel x12, x11, x12, cs // cs = hs, nlast + csel x2, x5, x2, cs // cs = hs, nlast + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ldr q20, [sp, #128] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #128] + rev64 v16.4s, v20.4s + subs x4, x24, x27 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x27, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x25, x24 + ldr q20, [sp, #144] + sbcs x5, x26, x27 
+ ngc x17, xzr + subs x8, x25, x26 + uaddlp v27.2d, v16.4s + umulh x4, x24, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #144] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x25, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x26, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x25 + eor x1, x10, x5 + adcs x16, x2, x26 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, 
x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + stp x19, x20, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + ldr q20, [sp, #64] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #64] + ldp x11, x15, [sp, #112] + rev64 v16.4s, v20.4s + subs x4, x24, x27 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x27, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x24 + ldr q20, [sp, #80] + sbcs x5, x15, x27 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x24, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, 
x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x9, x11, x13 + and x1, x1, x13 + adcs x10, x4, x1 + and x1, x12, x13 + stp x9, x10, [sp, #64] + adcs x11, x7, xzr + adc x12, x17, x1 + stp x11, x12, [sp, #80] + mov x0, sp + mov x1, sp + ldp x5, x6, [x1] + subs x5, x5, x19 + sbcs x6, x6, x20 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc // cc = lo, ul, last + adds x24, x5, x3 + and x4, x3, #0xffffffff + adcs x25, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x7, x8, [x0, #16] + subs x5, x9, x19 + sbcs x6, x10, x20 + ldp x4, x3, [sp, #144] + sbcs x7, x11, x4 + sbcs x8, x12, x3 + csetm x3, cc // cc = lo, ul, last + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldr q20, [x22, #64] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x22, #64] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #80] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, 
x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #160] + adcs x19, x7, xzr + adc x20, x17, x1 + stp x19, x20, [sp, #176] + mov x0, sp + mov x1, sp + ldp x4, x3, [sp, #64] + subs x5, x24, x4 + sbcs x6, x25, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc // cc = lo, ul, last + adds x9, x5, x3 + and x4, x3, #0xffffffff + adcs x10, x6, x4 + adcs x11, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x3, x8, x4 + stp x9, x10, [x0] + stp x11, x3, [x0, #16] + ldp x5, x6, [sp, #128] + subs x5, x5, x9 + sbcs x6, x6, x10 + ldp x7, x8, [sp, #144] + sbcs x7, x7, x11 + sbcs x8, x8, x3 + csetm x3, cc // cc = lo, ul, last + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldr q20, [sp, #192] + ldp x7, x17, [sp, #96] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #192] + ldp x11, x15, [sp, #112] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + 
csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #208] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #208] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, 
x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #96] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #112] + ldr q20, [x23, #64] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x23, #64] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x19, x7 + ldr q20, [x23, #80] + sbcs x5, x20, x17 + ngc x17, xzr + subs x8, x19, x20 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x24, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x25, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor 
x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x24 + eor x1, x10, x5 + adcs x16, x2, x25 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + stp x19, x20, [sp, #160] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #176] + ldr q20, [sp, #128] + ldp x7, x17, [sp, #32] + ldr q0, [sp, #32] + ldp x6, x10, [sp, #128] + ldp x11, x15, [sp, #48] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #144] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #48] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #144] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg 
x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x5, x11, x13 + and x1, x1, x13 + adcs x6, x4, x1 + and x1, x12, x13 + adcs x7, x7, xzr + adc x9, x17, x1 + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x9, x3 + csetm x3, cc // cc = lo, ul, last + adds x15, x5, x3 + and x4, x3, #0xffffffff + adcs x24, x6, x4 + adcs x25, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x26, x8, x4 + stp x15, x24, [sp, #128] + stp x25, x26, [sp, #144] + ldp x0, x1, [x22, #64] + ldp x2, x3, [x22, #80] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne // ne = any + ldp x4, x5, [x23, #64] + ldp x6, x7, [x23, #80] + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne // ne = any + cmp x13, x12 + csel x8, x0, x19, cc // cc = lo, ul, last + csel x9, x1, x20, cc // cc = lo, ul, last + csel x8, x4, x8, hi // hi = pmore + csel x9, x5, x9, hi // hi = pmore + ldp x10, x11, [sp, #176] + csel x10, x2, x10, cc // cc = lo, ul, last + csel x11, x3, x11, cc // cc = lo, ul, last + csel x10, x6, x10, hi // hi = pmore + csel x11, x7, x11, hi // hi = pmore + ldp x12, x13, [x22] + ldp x0, x1, [sp] + csel x0, x12, x0, cc // cc = lo, ul, last + csel x1, x13, x1, cc // cc = lo, ul, last + ldp x12, x13, [x23] + csel x0, x12, x0, hi // hi = pmore + csel x1, x13, x1, hi // hi = pmore + ldp x12, x13, [x22, #16] + ldp x2, x3, [sp, #16] + csel x2, x12, x2, cc // cc = lo, ul, last + csel x3, x13, x3, cc // cc = lo, ul, last + ldp x12, x13, [x23, #16] + csel x2, x12, 
x2, hi // hi = pmore + csel x3, x13, x3, hi // hi = pmore + ldp x12, x13, [x22, #32] + csel x4, x12, x15, cc // cc = lo, ul, last + csel x5, x13, x24, cc // cc = lo, ul, last + ldp x12, x13, [x23, #32] + csel x4, x12, x4, hi // hi = pmore + csel x5, x13, x5, hi // hi = pmore + ldp x12, x13, [x22, #48] + csel x6, x12, x25, cc // cc = lo, ul, last + csel x7, x13, x26, cc // cc = lo, ul, last + ldp x12, x13, [x23, #48] + csel x6, x12, x6, hi // hi = pmore + csel x7, x13, x7, hi // hi = pmore + stp x0, x1, [x21] + stp x2, x3, [x21, #16] + stp x4, x5, [x21, #32] + stp x6, x7, [x21, #48] + stp x8, x9, [x21, #64] + stp x10, x11, [x21, #80] + add sp, sp, #0xe0 + ldp x27, x30, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p256_scalarmul_local_p256_montjdouble: + sub sp, sp, #0x110 + stp x19, x20, [sp, #192] + stp x21, x22, [sp, #208] + stp x23, x24, [sp, #224] + stp x25, x26, [sp, #240] + stp x27, xzr, [sp, #256] + mov x19, x0 + mov x20, x1 + mov x0, sp + ldr q19, [x20, #64] + ldp x9, x13, [x20, #64] + ldr q23, [x20, #80] + ldr q0, [x20, #64] + ldp x1, x10, [x20, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds 
x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x21, x3, x16, cs // cs = hs, nlast + csel x22, x8, x14, cs // cs = hs, nlast + csel x23, x11, x12, cs // cs = hs, nlast + csel x24, x5, x2, cs // cs = hs, nlast + stp x22, x23, [x0, #16] + stp x21, x24, [x0] + ldr q19, [x20, #32] + ldp x9, x13, [x20, #32] + ldr q23, [x20, #48] + ldr q0, [x20, #32] + ldp x1, x10, [x20, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs // cs = hs, nlast + csel x14, x8, x14, cs // cs = hs, nlast + csel x12, x11, x12, cs // cs = hs, nlast + csel x2, x5, x2, cs // cs = hs, nlast + stp x14, x12, [sp, #48] + stp x16, x2, [sp, #32] + ldp x5, x6, [x20] + subs x5, x5, x21 + sbcs x6, x6, 
x24 + ldp x7, x8, [x20, #16] + sbcs x7, x7, x22 + sbcs x8, x8, x23 + csetm x3, cc // cc = lo, ul, last + adds x10, x5, x3 + and x4, x3, #0xffffffff + adcs x25, x6, x4 + adcs x26, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x27, x8, x4 + stp x10, x25, [sp, #96] + stp x26, x27, [sp, #112] + ldp x5, x6, [x20] + adds x5, x5, x21 + adcs x6, x6, x24 + ldp x7, x8, [x20, #16] + adcs x7, x7, x22 + adcs x8, x8, x23 + csetm x3, cs // cs = hs, nlast + subs x9, x5, x3 + and x1, x3, #0xffffffff + sbcs x5, x6, x1 + sbcs x7, x7, xzr + and x2, x3, #0xffffffff00000001 + sbc x8, x8, x2 + stp x9, x5, [sp, #64] + stp x7, x8, [sp, #80] + ldr q20, [sp, #96] + ldr q0, [sp, #64] + rev64 v16.4s, v20.4s + subs x4, x9, x5 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x5, x25 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x7, x9 + ldr q20, [sp, #112] + sbcs x5, x8, x5 + ngc x17, xzr + subs x8, x7, x8 + uaddlp v27.2d, v16.4s + umulh x4, x9, x10 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x25, x10 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x10, x26 + sbcs x9, x25, x27 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x27, x26 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x21, x3, x13 + adcs x22, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x23, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x24, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, 
x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x21 + adcs x15, x16, x22 + eor x5, x17, x4 + adcs x9, x1, x23 + eor x1, x10, x5 + adcs x16, x2, x24 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x21, x11, x13 + and x1, x1, x13 + adcs x22, x4, x1 + and x1, x12, x13 + stp x21, x22, [sp, #96] + adcs x23, x7, xzr + adc x24, x17, x1 + stp x23, x24, [sp, #112] + ldp x4, x5, [x20, #32] + ldp x8, x9, [x20, #64] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x20, #48] + ldp x10, x11, [x20, #80] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #0xffffffff // #4294967295 + sbcs x9, x5, x9 + sbcs x10, x6, xzr + mov x11, #0xffffffff00000001 // #-4294967295 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, cc // cc = lo, ul, last + csel x5, x5, x9, cc // cc = lo, ul, last + csel x6, x6, x10, cc // cc = lo, ul, last + csel x7, x7, x11, cc // cc = lo, ul, last + stp x4, x5, [sp, #64] + stp x6, x7, [sp, #80] + ldr q20, [sp, #32] + ldp x7, x17, [x20] + ldr q0, [x20] + ldp x6, x10, [sp, #32] + ldp x11, x15, [x20, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [x20, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, 
x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x20, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x20 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x20, x11, x13 + and x1, x1, x13 + adcs x25, x4, x1 + and x1, x12, x13 + stp x20, x25, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + ldr q19, [sp, #96] + ldr q23, [sp, #112] + ldr q0, [sp, #96] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x21, x22 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x21, x22 + umulh x15, x21, x23 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x21, x22 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x24, x23 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, 
x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x22, x24 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x23, x24 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x23, x24 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x24, x24 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x24, x24 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x23, x23 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x23, x23 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x21, x3, x16, cs // cs = hs, nlast + csel x22, x8, x14, cs // cs = hs, nlast + csel x23, x11, x12, cs // cs = hs, nlast + csel x24, x5, x2, cs // cs = hs, nlast + ldr q19, [sp, #64] + ldp x9, x13, [sp, #64] + ldr q23, [sp, #80] + ldr q0, [sp, #64] + ldp x1, x10, [sp, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + 
adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x13, x3, x16, cs // cs = hs, nlast + csel x14, x8, x14, cs // cs = hs, nlast + csel x15, x11, x12, cs // cs = hs, nlast + csel x26, x5, x2, cs // cs = hs, nlast + mov x1, #0x9 // #9 + mov x2, #0xffffffffffffffff // #-1 + subs x9, x2, x21 + mov x2, #0xffffffff // #4294967295 + sbcs x10, x2, x24 + ngcs x11, x22 + mov x2, #0xffffffff00000001 // #-4294967295 + sbc x12, x2, x23 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc // #12 + mul x8, x20, x1 + umulh x9, x20, x1 + adds x3, x3, x8 + mul x8, x25, x1 + umulh x10, x25, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #144] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x20, x3, x8 + and x9, x8, #0xffffffff + adcs x21, x4, x9 + adcs x22, x5, xzr + neg x10, x9 + adc x23, x6, x10 + stp x20, x21, [sp, #160] + stp x22, x23, [sp, #176] + mov x2, sp + ldp x4, x3, [x2] + subs x5, x13, x4 + sbcs x6, x26, x3 + ldp x4, x3, [x2, #16] + sbcs x7, x14, x4 + sbcs x8, x15, x3 + csetm x3, cc // cc = lo, ul, last + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + mov x0, sp + ldr q19, [sp, #32] + ldp x9, x13, [sp, #32] + ldr q23, [sp, #48] + ldr q0, [sp, #32] + ldp x1, x10, [sp, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc // cc = lo, ul, 
last + cneg x6, x14, cc // cc = lo, ul, last + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc // cc = lo, ul, last + cinv x2, x5, cc // cc = lo, ul, last + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff // #4294967295 + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 // #-4294967295 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x24, x3, x16, cs // cs = hs, nlast + csel x25, x8, x14, cs // cs = hs, nlast + csel x26, x11, x12, cs // cs = hs, nlast + csel x27, x5, x2, cs // cs = hs, nlast + stp x25, x26, [x0, #16] + stp x24, x27, [x0] + ldr q20, [sp, #96] + ldr q0, [sp, #160] + ldp x6, x10, [sp, #96] + rev64 v16.4s, v20.4s + subs x4, x20, x21 + csetm x3, cc // cc = lo, ul, last + cneg x13, x4, cc // cc = lo, ul, last + mul v16.4s, v16.4s, v0.4s + umulh x12, x21, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x22, x20 + ldr q20, [sp, #112] + sbcs x5, x23, x21 + ngc x17, xzr + subs x8, x22, x23 + uaddlp v27.2d, v16.4s + umulh x4, x20, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc // cc = lo, ul, last + shl v17.2d, v27.2d, #32 + csetm x15, cc // cc = lo, ul, last + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc // cc = lo, ul, last + cinv x9, x3, cc // cc = lo, ul, last + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #112] + adcs x12, x12, xzr + cmn x9, 
#0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc // cc = lo, ul, last + cneg x6, x6, cc // cc = lo, ul, last + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x20, x3, x13 + adcs x21, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x22, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x23, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc // cc = lo, ul, last + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc // cc = lo, ul, last + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc // cc = lo, ul, last + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc // cc = lo, ul, last + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x20 + adcs x15, x16, x21 + eor x5, x17, x4 + adcs x9, x1, x22 + eor x1, x10, x5 + adcs x16, x2, x23 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff // #4294967295 + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 // #-4294967295 + adds x14, x11, x13 + and x1, x1, x13 + adcs x15, x4, x1 + and x1, x12, x13 + stp x14, x15, [sp, #96] + adcs x13, x7, xzr + adc x20, x17, x1 + stp x13, x20, [sp, #112] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc // cc = lo, ul, last + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, 
[x19, #64] + stp x7, x8, [x19, #80] + ldp x1, x2, [sp, #128] + lsl x0, x1, #2 + ldp x6, x7, [sp, #160] + subs x0, x0, x6 + extr x1, x2, x1, #62 + sbcs x1, x1, x7 + ldp x3, x4, [sp, #144] + extr x2, x3, x2, #62 + ldp x6, x7, [sp, #176] + sbcs x2, x2, x6 + extr x3, x4, x3, #62 + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x5, x4, #0x1 + lsl x8, x5, #32 + negs x6, x8 + ngcs x7, xzr + sbc x8, x8, x5 + adds x0, x0, x5 + adcs x1, x1, x6 + adcs x2, x2, x7 + adcs x3, x3, x8 + csetm x5, cc // cc = lo, ul, last + adds x0, x0, x5 + and x6, x5, #0xffffffff + adcs x1, x1, x6 + adcs x2, x2, xzr + neg x7, x6 + adc x3, x3, x7 + stp x0, x1, [x19] + stp x2, x3, [x19, #16] + mov x2, #0xffffffffffffffff // #-1 + subs x9, x2, x24 + mov x2, #0xffffffff // #4294967295 + sbcs x10, x2, x27 + ngcs x11, x25 + mov x2, #0xffffffff00000001 // #-4294967295 + sbc x12, x2, x26 + lsl x3, x9, #3 + extr x4, x10, x9, #61 + extr x5, x11, x10, #61 + extr x6, x12, x11, #61 + lsr x7, x12, #61 + mov x1, #0x3 // #3 + mul x8, x14, x1 + umulh x9, x14, x1 + adds x3, x3, x8 + mul x8, x15, x1 + umulh x10, x15, x1 + adcs x4, x4, x8 + mul x8, x13, x1 + umulh x11, x13, x1 + adcs x5, x5, x8 + mul x8, x20, x1 + umulh x12, x20, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [x19, #32] + stp x5, x6, [x19, #48] + ldp x27, xzr, [sp, #256] + ldp x25, x26, [sp, #240] + ldp x23, x24, [sp, #224] + ldp x21, x22, [sp, #208] + ldp x19, x20, [sp, #192] + add sp, sp, #0x110 + ret + +p256_scalarmul_local_p256_montjmixadd: + stp x19, x20, [sp, #-16]! 
+ sub sp, sp, #0xc0 + mov x17, x0 + mov x19, x1 + mov x20, x2 + ldp x2, x3, [x19, #64] + ldp x4, x5, [x19, #80] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp] + stp x10, x6, [sp, #16] + ldp x3, x4, [x19, #64] + ldp x5, x6, [x19, #80] + ldp x7, x8, [x20, #32] + ldp x9, x10, [x20, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #32] + stp x11, x12, [sp, #48] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 
+ adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #32] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #48] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #32] + stp x3, x4, [sp, #48] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #16] + ldp x7, x8, [x20] + ldp x9, x10, [x20, #16] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #64] + stp x11, x12, [sp, #80] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + 
csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #64] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #80] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #64] + stp x3, x4, [sp, #80] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #16] + ldp x7, x8, [sp, #32] + ldp x9, x10, [sp, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #32] + stp x11, x12, [sp, #48] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 
+ cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #32] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #48] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #32] + stp x3, x4, [sp, #48] + ldp x5, x6, [sp, #64] + ldp x4, x3, [x19] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [x19, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [x19, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [x19, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + ldp x4, x5, [sp, #176] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl 
x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp, #96] + stp x10, x6, [sp, #112] + ldp x2, x3, [sp, #32] + ldp x4, x5, [sp, #48] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs 
x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp] + stp x10, x6, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, x8, [x19] + ldp x9, x10, [x19, #16] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #128] + stp x11, x12, [sp, #144] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #128] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #144] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, 
x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #128] + stp x3, x4, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, x8, [sp, #64] + ldp x9, x10, [sp, #80] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #64] + stp x11, x12, [sp, #80] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #64] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #80] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc 
x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #64] + stp x3, x4, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x5, x6, [sp, #176] + ldp x7, x8, [x19, #64] + ldp x9, x10, [x19, #80] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #160] + stp x11, x12, [sp, #176] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #160] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #176] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, 
x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #160] + stp x3, x4, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, x8, [x19, #32] + ldp x9, x10, [x19, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #96] + stp x11, x12, [sp, #112] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, 
x1, [sp, #96] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #112] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #96] + stp x3, x4, [sp, #112] + ldp x3, x4, [sp, #32] + ldp x5, x6, [sp, #48] + ldp x7, x8, [sp, #128] + ldp x9, x10, [sp, #144] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #128] + stp x11, x12, [sp, #144] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #128] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #144] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, 
xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #128] + stp x3, x4, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x19, #64] + ldp x2, x3, [x19, #80] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + ldp x0, x1, [sp] + ldp x12, x13, [x20] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [sp, #16] + ldp x12, x13, [x20, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x4, x5, [sp, #128] + ldp x12, x13, [x20, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [sp, #144] + ldp x12, x13, [x20, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x8, x9, [sp, #160] + mov x12, #0x1 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [sp, #176] + mov x12, #0xffffffffffffffff + mov x13, #0xfffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + stp x0, x1, [x17] + stp x2, x3, [x17, #16] + stp x4, x5, [x17, #32] + stp x6, x7, [x17, #48] + stp x8, x9, [x17, #64] + stp x10, x11, [x17, #80] + add sp, sp, #0xc0 + ldp x19, x20, [sp], #16 + ret + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul_alt.S new file mode 100644 index 00000000000..77e3349e34c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmul_alt.S @@ -0,0 +1,6190 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0
+
+// ----------------------------------------------------------------------------
+// Scalar multiplication for P-256
+// Input scalar[4], point[8]; output res[8]
+//
+// extern void p256_scalarmul_alt
+//   (uint64_t res[static 8],
+//    uint64_t scalar[static 4],
+//    uint64_t point[static 8]);
+//
+// Given scalar = n and point = P, assumed to be on the NIST elliptic
+// curve P-256, returns the point (X,Y) = n * P. The input and output
+// are affine points, and in the case of the point at infinity as
+// the result, (0,0) is returned.
+//
+// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmul_alt)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmul_alt)
+
+        .text
+        .balign 4
+
+// Size of individual field elements
+
+#define NUMSIZE 32
+
+// Safe copies of inputs (res lasts the whole code, point not so long)
+// and additional values in variables, with some aliasing
+
+#define res x19
+#define sgn x20
+#define j x20
+#define point x21
+
+// Intermediate variables on the stack. The last z2, z3 values can
+// safely be overlaid on the table, which is no longer needed at the end.
+
+#define scalarb sp, #(0*NUMSIZE)
+#define acc sp, #(1*NUMSIZE)
+#define tabent sp, #(4*NUMSIZE)
+
+#define tab sp, #(7*NUMSIZE)
+
+#define z2 sp, #(7*NUMSIZE)
+#define z3 sp, #(8*NUMSIZE)
+
+#define NSPACE #(31*NUMSIZE)
+
+// Loading large constants
+
+#define movbig(nn,n3,n2,n1,n0) \
+        movz nn, n0 __LF \
+        movk nn, n1, lsl #16 __LF \
+        movk nn, n2, lsl #32 __LF \
+        movk nn, n3, lsl #48
+
+S2N_BN_SYMBOL(p256_scalarmul_alt):
+
+        stp x19, x20, [sp, #-16]!
+        stp x21, x30, [sp, #-16]!
+        sub sp, sp, NSPACE
+
+// Preserve the "res" and "point" input arguments. We load and process the
+// scalar immediately so we don't bother preserving that input argument.
+// Also, "point" is only needed early on and so its register gets re-used.
+
+        mov res, x0
+        mov point, x2
+
+// Load the digits of group order n_256 = [x12;x13;x14;x15]
+
+        movbig(x12, #0xf3b9, #0xcac2, #0xfc63, #0x2551)
+        movbig(x13, #0xbce6, #0xfaad, #0xa717, #0x9e84)
+        mov x14, #0xffffffffffffffff
+        mov x15, #0xffffffff00000000
+
+// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256
+
+        ldp x2, x3, [x1]
+        ldp x4, x5, [x1, #16]
+
+        subs x6, x2, x12
+        sbcs x7, x3, x13
+        sbcs x8, x4, x14
+        sbcs x9, x5, x15
+
+        csel x2, x2, x6, cc
+        csel x3, x3, x7, cc
+        csel x4, x4, x8, cc
+        csel x5, x5, x9, cc
+
+// Now if the top bit of the reduced scalar is set, negate it mod n_256,
+// i.e. do n |-> n_256 - n. Remember the sign as "sgn" so we can
+// correspondingly negate the point below.
+
+        subs x6, x12, x2
+        sbcs x7, x13, x3
+        sbcs x8, x14, x4
+        sbc x9, x15, x5
+
+        tst x5, #0x8000000000000000
+        csel x2, x2, x6, eq
+        csel x3, x3, x7, eq
+        csel x4, x4, x8, eq
+        csel x5, x5, x9, eq
+        cset sgn, ne
+
+// In either case then add the recoding constant 0x08888...888 to allow
+// signed digits.
+
+        mov x6, 0x8888888888888888
+        adds x2, x2, x6
+        adcs x3, x3, x6
+        bic x7, x6, #0xF000000000000000
+        adcs x4, x4, x6
+        adc x5, x5, x7
+
+        stp x2, x3, [scalarb]
+        stp x4, x5, [scalarb+16]
+
+// Set the tab[0] table entry to Montgomery-Jacobian point = 1 * P
+// The z coordinate is just the Montgomery form of the constant 1.
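The recoding above means that, in the main loop further down, each 4-bit window of the stored scalar encodes the signed digit w - 8 in [-8, 7]: its magnitude selects one of the precomputed multiples 1*P .. 8*P and its sign decides whether the y coordinate is negated. The C fragment below is a minimal sketch of that window handling, not part of this patch or of the imported s2n-bignum sources; the 12-limb Jacobian row layout and the helper names recode_window and select_table_row are hypothetical, chosen only to mirror the subs/cset/cneg and cmp/csel sequences in the assembly.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only; hypothetical helpers, not s2n-bignum code.
   A Jacobian point is stored as 12 little-endian 64-bit limbs: X|Y|Z. */
#define P256_JAC_LIMBS 12

/* Decode one recoded 4-bit window w (0..15). After the 0x0888...888 offset,
   w represents the signed digit w - 8 in [-8, 7]; return its magnitude and a
   sign flag without any data-dependent branch. */
static void recode_window(uint64_t w, uint64_t *sign, uint64_t *mag) {
  uint64_t d = w - 8;                 /* two's-complement signed digit */
  *sign = d >> 63;                    /* 1 iff the digit is negative */
  *mag = (d ^ (0 - *sign)) + *sign;   /* |d| in 0..8 */
}

/* Constant-time selection of row idx (1-based) from a table of `height`
   Jacobian points; idx == 0 leaves the all-zero placeholder, matching the
   zero-initialized accumulator before the cmp/csel block. */
static void select_table_row(uint64_t out[P256_JAC_LIMBS],
                             const uint64_t *table, size_t height,
                             uint64_t idx) {
  for (size_t j = 0; j < P256_JAC_LIMBS; j++) out[j] = 0;
  for (uint64_t i = 1; i <= height; i++) {
    uint64_t diff = i ^ idx;
    /* mask = all-ones iff i == idx, computed without a secret-dependent branch */
    uint64_t mask = 0 - (uint64_t)(((diff | (0 - diff)) >> 63) ^ 1);
    for (size_t j = 0; j < P256_JAC_LIMBS; j++)
      out[j] |= table[(i - 1) * P256_JAC_LIMBS + j] & mask;
  }
}

The masked-OR loop plays the same role as the cmp/csel chain in the .rep blocks: every table row is read regardless of the digit value, so the memory access pattern does not depend on the secret scalar; the assembly additionally negates the selected y coordinate mod p when the sign flag is set.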
+ + add x0, tab + mov x1, point + bl p256_scalarmul_alt_local_tomont_p256 + + add x1, point, #32 + add x0, tab+32 + bl p256_scalarmul_alt_local_tomont_p256 + + mov x0, #0x0000000000000001 + mov x1, #0xffffffff00000000 + stp x0, x1, [tab+64] + mov x2, #0xffffffffffffffff + mov x3, #0x00000000fffffffe + stp x2, x3, [tab+80] + +// If the top bit of the scalar was set, negate (y coordinate of) the point + + ldp x4, x5, [tab+32] + ldp x6, x7, [tab+48] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + cmp sgn, xzr + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tab+32] + stp x6, x7, [tab+48] + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + add x0, tab+96*1 + add x1, tab + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, tab+96*2 + add x1, tab+96*1 + add x2, tab + bl p256_scalarmul_alt_local_p256_montjmixadd + + add x0, tab+96*3 + add x1, tab+96*1 + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, tab+96*4 + add x1, tab+96*3 + add x2, tab + bl p256_scalarmul_alt_local_p256_montjmixadd + + add x0, tab+96*5 + add x1, tab+96*2 + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, tab+96*6 + add x1, tab+96*5 + add x2, tab + bl p256_scalarmul_alt_local_p256_montjmixadd + + add x0, tab+96*7 + add x1, tab+96*3 + bl p256_scalarmul_alt_local_p256_montjdouble + +// Initialize the accumulator as a table entry for top 4 bits (unrecoded) + + ldr x14, [scalarb+24] + lsr x14, x14, #60 + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + + mov j, #252 + +// Main loop over size-4 bitfields: double 4 times then add signed digit + +p256_scalarmul_alt_loop: + sub j, j, #4 + + add x0, acc + add x1, acc + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_alt_local_p256_montjdouble + + add x0, acc + add x1, acc + bl p256_scalarmul_alt_local_p256_montjdouble + + lsr x2, j, #6 + ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly + lsr x14, x14, j + and x14, x14, #15 + + subs x14, x14, #8 + cset x16, lo // x16 = sign of digit (1 = negative) + cneg x14, x14, lo // x14 = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, 
x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + +// Store it to "tabent" with the y coordinate optionally negated + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + cmp x16, xzr + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + + add x0, acc + add x1, acc + add x2, tabent + bl p256_scalarmul_alt_local_p256_montjadd + + cbnz j, p256_scalarmul_alt_loop + +// That's the end of the main loop, and we just need to translate +// back from the Jacobian representation to affine. First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + add x0, z2 + add x1, acc+64 + bl p256_scalarmul_alt_local_montsqr_p256 + + add x0, z3 + add x2, z2 + add x1, acc+64 + bl p256_scalarmul_alt_local_montmul_p256 + + add x0, z2 + add x1, z3 + bl p256_scalarmul_alt_local_demont_p256 + + add x0, z3 + add x1, z2 + bl p256_scalarmul_alt_local_inv_p256 + + add x0, z2 + add x2, z3 + add x1, acc+64 + bl p256_scalarmul_alt_local_montmul_p256 + +// Convert back from Jacobian (X,Y,Z) |-> (X/Z^2, Y/Z^3) + + add x1, acc + add x2, z2 + mov x0, res + bl p256_scalarmul_alt_local_montmul_p256 + + add x0, res, #32 + add x1, acc+32 + add x2, z3 + bl p256_scalarmul_alt_local_montmul_p256 + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmul_alt_local_demont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + lsl x7, x2, #32 + subs x8, x2, x7 + lsr x6, x2, #32 + sbc x2, x2, x6 + adds x3, x3, x7 + adcs x4, x4, x6 + adcs x5, x5, x8 + adc x2, x2, xzr + lsl x7, x3, #32 + subs x8, x3, x7 + lsr x6, x3, #32 + sbc x3, x3, x6 + adds x4, x4, x7 + adcs x5, x5, x6 + adcs x2, x2, x8 + adc x3, x3, xzr + lsl x7, x4, #32 + subs x8, x4, x7 + lsr x6, x4, #32 + sbc x4, x4, x6 + adds x5, x5, x7 + adcs x2, x2, x6 + adcs x3, x3, x8 + adc x4, x4, xzr + lsl x7, x5, #32 + subs x8, x5, x7 + lsr x6, x5, #32 + sbc x5, x5, x6 + adds x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x8 + adc x5, x5, xzr + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +p256_scalarmul_alt_local_inv_p256: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
+ sub sp, sp, #0xa0 + mov x20, x0 + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [sp] + stp xzr, x13, [sp, #16] + str xzr, [sp, #32] + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #16] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + stp x2, x3, [sp, #48] + stp x4, x5, [sp, #64] + str xzr, [sp, #80] + stp xzr, xzr, [sp, #96] + stp xzr, xzr, [sp, #112] + mov x10, #0x4000000000000 + stp x10, xzr, [sp, #128] + stp xzr, xzr, [sp, #144] + mov x21, #0xa + mov x22, #0x1 + b p256_scalarmul_alt_inv_midloop +p256_scalarmul_alt_inv_loop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #48] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #64] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #56] + ldr x7, [sp, #24] + eor x1, x7, x14 + ldr x23, [sp, #32] + eor x3, x23, x14 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #72] + eor x1, x8, x15 + ldr x24, [sp, #80] + eor x0, x24, x15 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + asr x3, x3, #59 + str x3, [sp, #32] + eor x1, x7, x16 + eor x5, x23, x16 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + eor x0, x24, x17 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #64] + extr x2, x5, x2, #59 + str x2, [sp, #72] + asr x5, x5, #59 + str x5, [sp, #80] + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + eor x1, 
x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #128] + adc x3, x3, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #136] + adc x4, x4, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #144] + adc x2, x2, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x6, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x6, x6, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x6, x6, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x6, x6, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + stp x1, x6, [sp, #96] + stp x5, x3, [sp, #112] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + ldp x0, x1, [sp, #128] + ldr x3, [sp, #144] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x3, x3, x11 + mov x10, #0x2000000000000000 + adcs x2, x2, x10 + mov x14, #0x1fffffffe0000000 + adc x5, x5, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x3, x3, x10 + adcs x2, x2, x14 + adcs x5, x5, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x3, x3, x11 + sbcs x2, x2, xzr + sbc x5, x5, x10 + stp x1, x3, [sp, #128] + stp x2, x5, [sp, #144] +p256_scalarmul_alt_inv_midloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #48] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst 
x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, 
x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, 
x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, 
ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + b.ne p256_scalarmul_alt_inv_loop + ldr x0, [sp] + ldr x1, [sp, #48] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x2, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x2, x2, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x2, x2, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x2, x2, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0xffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + stp x10, x11, [x20] + stp x12, x13, [x20, #16] + add sp, sp, #0xa0 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p256_scalarmul_alt_local_montmul_p256: + ldp x3, x4, [x1] + ldp x7, x8, [x2] + mul x12, x3, x7 + umulh x13, x3, x7 + mul 
x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x2, #16] + mul x11, x3, x9 + umulh x15, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x16, x3, x10 + adcs x15, x15, x11 + adc x16, x16, xzr + ldp x5, x6, [x1, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x15, x15, x11 + mul x11, x4, x10 + adcs x16, x16, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x15, x15, x11 + umulh x11, x4, x9 + adcs x16, x16, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x15, x15, x11 + mul x11, x5, x9 + adcs x16, x16, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x15, x15, x11 + umulh x11, x5, x8 + adcs x16, x16, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x15, x15, x11 + mul x11, x6, x8 + adcs x16, x16, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x15, x15, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x16, x16, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x15, x15, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x15, x15, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x15, lsl #32 + lsr x11, x15, #32 + adcs x13, x13, x11 + mul x11, x15, x10 + umulh x15, x15, x10 + adcs x14, x14, x11 + adc x15, x15, xzr + adds x12, x12, x16 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x15, x15, x5 + cset x8, cs + mov x11, #0xffffffff + adds x16, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x15, x10 + sbcs xzr, x8, xzr + csel x12, x12, x16, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x15, x15, x5, cc + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + ret + +p256_scalarmul_alt_local_montsqr_p256: + ldp x2, x3, [x1] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x1, #16] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x2, x8, #32 + adcs x10, x10, x2 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x2, x9, #32 + adcs x11, x11, x2 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x2, x10, #32 + adcs x8, x8, x2 + mul x2, x10, x5 + umulh 
x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x2, x11, #32 + adcs x9, x9, x2 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + ret + +p256_scalarmul_alt_local_tomont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + mov x1, #0xffffffffffffffff + mov x7, #0xffffffff + mov x9, #0xffffffff00000001 + subs x1, x2, x1 + sbcs x7, x3, x7 + sbcs x8, x4, xzr + sbcs x9, x5, x9 + csel x2, x2, x1, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + cmp xzr, xzr + extr x9, x5, x4, #32 + adcs xzr, x4, x9 + lsr x9, x5, #32 + adcs x9, x5, x9 + csetm x6, cs + orr x9, x9, x6 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x4, x4, x7 + adc x5, x5, x8 + negs x6, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x6, x6 + sbcs x2, x2, x7 + sbcs x3, x3, x8 + sbcs x4, x4, x9 + sbcs x5, x5, x9 + adds x6, x6, x5 + mov x7, #0xffffffff + and x7, x7, x5 + adcs x2, x2, x7 + adcs x3, x3, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x5 + adc x4, x4, x7 + cmp xzr, xzr + extr x9, x4, x3, #32 + adcs xzr, x3, x9 + lsr x9, x4, #32 + adcs x9, x4, x9 + csetm x5, cs + orr x9, x9, x5 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x3, x3, x7 + adc x4, x4, x8 + negs x5, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x5, x5 + sbcs x6, x6, x7 + sbcs x2, x2, x8 + sbcs x3, x3, x9 + sbcs x4, x4, x9 + adds x5, x5, x4 + mov x7, #0xffffffff + and x7, x7, x4 + adcs x6, x6, x7 + adcs x2, x2, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x4 + adc x3, x3, x7 + cmp xzr, xzr + extr x9, x3, x2, #32 + adcs xzr, x2, x9 + lsr x9, x3, #32 + adcs x9, x3, x9 + csetm x4, cs + orr x9, x9, x4 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x2, x2, x7 + adc x3, x3, x8 + negs x4, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x4, x4 + sbcs x5, x5, x7 + sbcs x6, x6, x8 + sbcs x2, x2, x9 + sbcs x3, x3, x9 + adds x4, x4, x3 + mov x7, #0xffffffff + and x7, x7, x3 + adcs x5, x5, x7 + adcs x6, x6, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x3 + adc x2, x2, x7 + cmp xzr, xzr + extr x9, x2, x6, #32 + adcs xzr, x6, x9 + lsr x9, x2, #32 + adcs x9, x2, x9 + csetm x3, cs + orr x9, x9, x3 + lsl x7, x9, #32 + lsr x8, x9, #32 + adds x6, x6, x7 + adc x2, x2, x8 + negs x3, x9 + sbcs x7, x7, xzr + sbc x8, x8, xzr + negs x3, x3 + sbcs x4, x4, x7 + sbcs x5, x5, x8 + sbcs x6, x6, x9 + sbcs x2, x2, x9 + adds x3, x3, x2 + mov x7, #0xffffffff + and x7, x7, x2 + adcs x4, x4, x7 + adcs x5, x5, xzr + mov x7, #0xffffffff00000001 + and x7, x7, x2 + adc x6, x6, x7 + stp x3, x4, [x0] + stp x5, x6, [x0, #16] + ret + +p256_scalarmul_alt_local_p256_montjadd: + sub sp, sp, #0xe0 + mov x15, x0 + mov x16, x1 + mov x17, x2 + ldp x2, x3, [x16, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + 
adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x2, x3, [x17, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x17, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp, #160] + stp x10, x11, [sp, #176] + ldp x3, x4, [x17, #64] + ldp x7, x8, [x16, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, 
x1, xzr + ldp x5, x6, [x17, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #192] + stp x14, x0, [sp, #208] + ldp x3, x4, [x16, #64] + ldp x7, x8, [x17, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, 
x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x3, x4, [sp] + ldp x7, x8, [x17] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x16] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, 
xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x3, x4, [sp] + ldp x7, x8, [sp, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc 
x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x3, x4, [sp, #160] + ldp x7, x8, [sp, #192] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #208] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #192] + stp x14, x0, [sp, #208] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [sp, #192] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [sp, #208] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, 
x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #176] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp, #96] + stp x10, x11, [sp, #112] + ldp x2, x3, [sp, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs 
x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + mov x5, #0xffffffff00000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #128] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #144] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs 
x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x16, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + 
umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #160] + stp x14, x0, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #192] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #208] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 
+ umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x17, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #160] + stp x14, x0, [sp, #176] + ldp x3, x4, [sp, #32] + ldp x7, x8, [sp, #128] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #144] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #48] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + 
adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x16, #64] + ldp x2, x3, [x16, #80] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + ldp x4, x5, [x17, #64] + ldp x6, x7, [x17, #80] + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + cmp x13, x12 + ldp x8, x9, [sp, #160] + csel x8, x0, x8, cc + csel x9, x1, x9, cc + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [sp, #176] + csel x10, x2, x10, cc + csel x11, x3, x11, cc + csel x10, x6, x10, hi + csel x11, x7, x11, hi + ldp x12, x13, [x16] + ldp x0, x1, [sp] + csel x0, x12, x0, cc + csel x1, x13, x1, cc + ldp x12, x13, [x17] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + ldp x12, x13, [x16, #16] + ldp x2, x3, [sp, #16] + csel x2, x12, x2, cc + csel x3, x13, x3, cc + ldp x12, x13, [x17, #16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + ldp x12, x13, [x16, #32] + ldp x4, x5, [sp, #128] + csel x4, x12, x4, cc + csel x5, x13, x5, cc + ldp x12, x13, [x17, #32] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + ldp x12, x13, [x16, #48] + ldp x6, x7, [sp, #144] + csel x6, x12, x6, cc + csel x7, x13, x7, cc + ldp x12, x13, [x17, #48] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + stp x0, x1, [x15] + stp x2, x3, [x15, #16] + stp x4, x5, [x15, #32] + stp x6, x7, [x15, #48] + stp x8, x9, [x15, #64] + stp x10, x11, [x15, #80] + add sp, sp, #0xe0 + ret + +p256_scalarmul_alt_local_p256_montjdouble: + sub sp, sp, #0xc0 + mov x15, x0 + mov x16, x1 + ldp x2, x3, [x16, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, 
x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x2, x3, [x16, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp, #32] + stp x10, x11, [sp, #48] + ldp x5, x6, [x16] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x16, #16] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, 
x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x5, x6, [x16] + ldp x4, x3, [sp] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x16, #16] + ldp x4, x3, [sp, #16] + adcs x7, x7, x4 + adcs x8, x8, x3 + csetm x3, cs + subs x5, x5, x3 + and x1, x3, #0xffffffff + sbcs x6, x6, x1 + sbcs x7, x7, xzr + and x2, x3, #0xffffffff00000001 + sbc x8, x8, x2 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + ldp x3, x4, [sp, #64] + ldp x7, x8, [sp, #96] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #112] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x5, x6, [x16, #32] + ldp x4, x3, [x16, #64] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x16, #48] + ldp x4, x3, [x16, #80] + adcs x7, x7, x4 + adcs x8, x8, x3 + adc x3, xzr, xzr + cmn x5, #0x1 + mov x4, #0xffffffff + sbcs xzr, x6, x4 + sbcs xzr, x7, xzr + mov x4, #0xffffffff00000001 + sbcs xzr, x8, x4 + adcs x3, x3, xzr + csetm x3, ne + subs x5, x5, x3 + and x4, x3, #0xffffffff + sbcs x6, x6, x4 + sbcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + sbc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + ldp x3, x4, [x16] + ldp x7, x8, [sp, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #48] + 
mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x2, x3, [sp, #96] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #112] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs 
x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp, #160] + stp x10, x11, [sp, #176] + ldp x2, x3, [sp, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp, #64] + stp x10, x11, [sp, #80] + mov x1, #0x9 + mov x2, #0xffffffffffffffff + ldp x9, x10, [sp, #160] + subs x9, x2, x9 + mov x2, #0xffffffff + sbcs x10, x2, x10 + ldp x11, x12, [sp, #176] + ngcs x11, x11 + mov x2, #0xffffffff00000001 + sbc x12, x2, x12 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc + ldp x9, x10, [sp, #128] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #144] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [sp, #160] + stp x5, x6, [sp, #176] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 
+ sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + ldp x2, x3, [sp, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [sp, #160] + ldp x7, x8, [sp, #96] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #112] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, 
x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [x15, #64] + stp x7, x8, [x15, #80] + ldp x1, x2, [sp, #128] + lsl x0, x1, #2 + ldp x6, x7, [sp, #160] + subs x0, x0, x6 + extr x1, x2, x1, #62 + sbcs x1, x1, x7 + ldp x3, x4, [sp, #144] + extr x2, x3, x2, #62 + ldp x6, x7, [sp, #176] + sbcs x2, x2, x6 + extr x3, x4, x3, #62 + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x5, x4, #0x1 + lsl x8, x5, #32 + negs x6, x8 + ngcs x7, xzr + sbc x8, x8, x5 + adds x0, x0, x5 + adcs x1, x1, x6 + adcs x2, x2, x7 + adcs x3, x3, x8 + csetm x5, cc + adds x0, x0, x5 + and x6, x5, #0xffffffff + adcs x1, x1, x6 + adcs x2, x2, xzr + neg x7, x6 + adc x3, x3, x7 + stp x0, x1, [x15] + stp x2, x3, [x15, #16] + mov x1, #0x8 + mov x2, #0xffffffffffffffff + ldp x9, x10, [sp] + subs x9, x2, x9 + mov x2, #0xffffffff + sbcs x10, x2, x10 + ldp x11, x12, [sp, #16] + ngcs x11, x11 + mov x2, #0xffffffff00000001 + sbc x12, x2, x12 + lsl x3, x9, #3 + extr x4, x10, x9, #61 + extr x5, x11, x10, #61 + extr x6, x12, x11, #61 + lsr x7, x12, #61 + mov x1, #0x3 + ldp x9, x10, [sp, #96] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #112] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [x15, #32] + stp x5, x6, [x15, #48] + add sp, sp, #0xc0 + ret + +p256_scalarmul_alt_local_p256_montjmixadd: + sub sp, sp, #0xc0 + mov x15, x0 + mov x16, x1 + mov x17, x2 + ldp x2, x3, [x16, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh 
x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [x16, #64] + ldp x7, x8, [x17, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, 
cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x3, x4, [sp] + ldp x7, x8, [x17] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x3, x4, [sp] + ldp x7, x8, [sp, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, 
x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x5, x6, [sp, #64] + ldp x4, x3, [x16] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [x16, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [x16, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [x16, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #176] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, 
#0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp, #96] + stp x10, x11, [sp, #112] + ldp x2, x3, [sp, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + mov x5, #0xffffffff00000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x7, x8, [x16] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 
+ adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, 
x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x16, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #160] + stp x14, x0, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [x16, #32] + mul x12, x3, x7 + umulh x13, 
x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x3, x4, [sp, #32] + ldp x7, x8, [sp, #128] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #144] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #48] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + 
adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x16, #64] + ldp x2, x3, [x16, #80] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + ldp x0, x1, [sp] + ldp x12, x13, [x17] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [sp, #16] + ldp x12, x13, [x17, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x4, x5, [sp, #128] + ldp x12, x13, [x17, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [sp, #144] + ldp x12, x13, [x17, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x8, x9, [sp, #160] + mov x12, #0x1 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [sp, #176] + mov x12, #0xffffffffffffffff + mov x13, #0xfffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + stp x0, x1, [x15] + stp x2, x3, [x15, #16] + stp x4, x5, [x15, #32] + stp x6, x7, [x15, #48] + stp x8, x9, [x15, #64] + stp x10, x11, [x15, #80] + add sp, sp, #0xc0 + ret + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase.S new file mode 100644 index 00000000000..29b6c9ed892 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase.S @@ -0,0 +1,3751 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for precomputed point on NIST curve P-256 +// Input scalar[4], blocksize, table[]; output res[8] +// +// extern void p256_scalarmulbase +// (uint64_t res[static 8], +// uint64_t scalar[static 4], +// uint64_t blocksize, +// uint64_t *table); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, the input argument "table" is expected to be a table of +// multiples of the point P in Montgomery-affine form, with each block +// corresponding to "blocksize" bits of the scalar as follows, where +// B = 2^{blocksize-1} (e.g. 
B = 8 for blocksize = 4):
+//
+// For each i,j with blocksize * i <= 256 and 1 <= j <= B
+// the multiple 2^{blocksize * i} * j * P is stored at
+// tab[8 * (B * i + (j - 1))], considered as uint64_t pointers
+// or tab + 64 * (B * i + (j - 1)) as byte pointers.
+//
+// Standard ARM ABI: X0 = res, X1 = scalar, X2 = blocksize, X3 = table
+// ----------------------------------------------------------------------------
+
+#include "_internal_s2n_bignum.h"
+
+        S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmulbase)
+        S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmulbase)
+
+        .text
+        .balign 4
+
+// Size of individual field elements
+
+#define NUMSIZE 32
+
+// Safe copies of inputs and additional variables, with some aliasing
+
+#define res x19
+#define blocksize x20
+#define table x21
+#define i x22
+#define bf x23
+#define cf x24
+#define j x25
+
+// Intermediate variables on the stack. The last z2, z3 values can
+// safely be overlaid on "nacc", which is no longer needed at the end.
+// Uppercase syntactic variants make x86_att version simpler to generate
+
+#define rscalar sp, #(0*NUMSIZE)
+#define acc sp, #(1*NUMSIZE)
+#define nacc sp, #(4*NUMSIZE)
+#define tabent sp, #(7*NUMSIZE)
+
+#define z2 sp, #(4*NUMSIZE)
+#define z3 sp, #(5*NUMSIZE)
+
+#define NSPACE #(9*NUMSIZE)
+
+// Loading large constants
+
+#define movbig(nn,n3,n2,n1,n0) \
+        movz nn, n0 __LF \
+        movk nn, n1, lsl #16 __LF \
+        movk nn, n2, lsl #32 __LF \
+        movk nn, n3, lsl #48
+
+S2N_BN_SYMBOL(p256_scalarmulbase):
+
+        stp x19, x20, [sp, #-16]!
+        stp x21, x22, [sp, #-16]!
+        stp x23, x24, [sp, #-16]!
+        stp x25, x30, [sp, #-16]!
+        sub sp, sp, NSPACE
+
+// Preserve the input arguments except the scalar, since that gets absorbed
+// immediately. The "table" value subsequently gets shifted up each iteration
+// of the loop, while "res" and "blocksize" are static throughout.
+
+        mov res, x0
+        mov blocksize, x2
+        mov table, x3
+
+// Load the digits of group order n_256 = [x15;x14;x13;x12]
+
+        movbig(x12, #0xf3b9, #0xcac2, #0xfc63, #0x2551)
+        movbig(x13, #0xbce6, #0xfaad, #0xa717, #0x9e84)
+        mov x14, #0xffffffffffffffff
+        mov x15, #0xffffffff00000000
+
+// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256
+// Store it to "rscalar" (reduced scalar)
+
+        ldp x2, x3, [x1]
+        ldp x4, x5, [x1, #16]
+
+        subs x6, x2, x12
+        sbcs x7, x3, x13
+        sbcs x8, x4, x14
+        sbcs x9, x5, x15
+
+        csel x2, x2, x6, cc
+        csel x3, x3, x7, cc
+        csel x4, x4, x8, cc
+        csel x5, x5, x9, cc
+
+        stp x2, x3, [rscalar]
+        stp x4, x5, [rscalar+16]
+
+// Initialize the accumulator to all zeros and the "carry flag" cf to 0
+
+        stp xzr, xzr, [acc]
+        stp xzr, xzr, [acc+16]
+        stp xzr, xzr, [acc+32]
+        stp xzr, xzr, [acc+48]
+        stp xzr, xzr, [acc+64]
+        stp xzr, xzr, [acc+80]
+        mov cf, xzr
+
+// Main loop over {i >= 0 | blocksize * i <= 256}. Note the non-strict
+// inequality, to allow top carry for any choices of blocksize.
+
+        mov i, xzr
+
+p256_scalarmulbase_loop:
+
+// The next raw bitfield is bf = bitfield(blocksize * i,blocksize) + cf,
+// adding in the deferred carry cf. We then shift the whole scalar right
+// by blocksize so we can keep picking bitfield(0,blocksize).
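The comments above describe the table layout and the signed-window recoding that this main loop performs. As a reading aid, here is a minimal C sketch of that bookkeeping, assuming hypothetical helper names `table_byte_offset` and `recode_window` (neither exists in s2n-bignum); only the arithmetic follows the comments in the assembly.

```c
#include <stdint.h>
#include <stddef.h>

/* Byte offset of the entry 2^{blocksize*i} * j * P (for 1 <= j <= B) in the
   precomputed table, per the layout comment above: each affine point is
   8 uint64_t words = 64 bytes. (Hypothetical helper, illustration only.) */
static size_t table_byte_offset(uint64_t blocksize, uint64_t i, uint64_t j) {
  uint64_t B = UINT64_C(1) << (blocksize - 1);   /* B = 2^{blocksize-1} */
  return (size_t)(64 * (B * i + (j - 1)));
}

/* Signed recoding of one blocksize-bit window. Returns j in [0, B], where
   j == 0 means "add nothing"; *negate reports whether the selected point's
   y coordinate must be negated, and *cf is the carry into the next window. */
static uint64_t recode_window(uint64_t window, uint64_t blocksize,
                              uint64_t *cf, int *negate) {
  uint64_t B = UINT64_C(1) << (blocksize - 1);
  uint64_t bf = window + *cf;      /* raw bitfield plus deferred carry */
  if (bf <= B) {                   /* add +bf * P, no carry */
    *cf = 0;
    *negate = 0;
    return bf;
  }
  *cf = 1;                         /* adding -(2B - bf) * P leaves 2B over, */
  *negate = 1;                     /* which carries as 1 into the next window */
  return 2 * B - bf;
}
```

Keeping the selected digit in 0..B rather than 0..2B-1 is what lets the table hold only B points per block: the other half of the digit range is reached by negating the stored point, which for an affine P-256 point only costs subtracting y from the field modulus, as the negation block later in the loop does.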
+
+        ldp x0, x1, [rscalar]
+        ldp x2, x3, [rscalar+16]
+
+        mov x4, #1
+        lsl x4, x4, blocksize
+        sub x4, x4, #1
+        and x4, x4, x0
+        add bf, x4, cf
+
+        neg x8, blocksize
+
+        lsl x5, x1, x8
+
+        lsr x0, x0, blocksize
+        orr x0, x0, x5
+
+        lsl x6, x2, x8
+        lsr x1, x1, blocksize
+        orr x1, x1, x6
+
+        lsl x7, x3, x8
+        lsr x2, x2, blocksize
+        orr x2, x2, x7
+
+        lsr x3, x3, blocksize
+
+        stp x0, x1, [rscalar]
+        stp x2, x3, [rscalar+16]
+
+// Now if bf <= B we just select entry j, unnegated and set cf = 0.
+// If bf > B we set j = 2 * B - bf and negate the j'th entry, setting cf = 1.
+// In either case we ultimately add bf, in the latter case with deferred
+// carry as 2 * B - (2 * B - bf) = bf.
+
+        mov x0, #1
+        lsl x1, x0, blocksize
+        lsr x0, x1, #1
+
+        sub x2, x1, bf
+
+        cmp x0, bf
+        cset cf, cc
+        csel j, x2, bf, cc
+
+// Load table entry j - 1 for nonzero j in constant-time style.
+
+        mov x16, #1
+        lsl x16, x16, blocksize
+        lsr x16, x16, #1
+        mov x17, j
+
+p256_scalarmulbase_tabloop:
+        ldp x8, x9, [table]
+        ldp x10, x11, [table, #16]
+        ldp x12, x13, [table, #32]
+        ldp x14, x15, [table, #48]
+
+        subs x17, x17, #1
+        csel x0, x8, x0, eq
+        csel x1, x9, x1, eq
+        csel x2, x10, x2, eq
+        csel x3, x11, x3, eq
+        csel x4, x12, x4, eq
+        csel x5, x13, x5, eq
+        csel x6, x14, x6, eq
+        csel x7, x15, x7, eq
+
+        add table, table, #64
+
+        sub x16, x16, #1
+        cbnz x16, p256_scalarmulbase_tabloop
+
+// Before storing back, optionally negate the y coordinate of the table entry
+
+        stp x0, x1, [tabent]
+        stp x2, x3, [tabent+16]
+
+        mov x0, 0xffffffffffffffff
+        subs x0, x0, x4
+        mov x1, 0x00000000ffffffff
+        sbcs x1, x1, x5
+        mov x3, 0xffffffff00000001
+        sbcs x2, xzr, x6
+        sbc x3, x3, x7
+
+        cmp cf, xzr
+        csel x4, x0, x4, ne
+        csel x5, x1, x5, ne
+        csel x6, x2, x6, ne
+        csel x7, x3, x7, ne
+
+        stp x4, x5, [tabent+32]
+        stp x6, x7, [tabent+48]
+
+// Add the adjusted table point to the accumulator
+
+        add x0, nacc
+        add x1, acc
+        add x2, tabent
+        bl p256_scalarmulbase_local_p256_montjmixadd
+
+// However, only commit that update to the accumulator if j is nonzero,
+// because the mixed addition function does not handle this case directly,
+// and in any case we didn't choose the table entry appropriately.
+
+        cmp j, xzr
+        ldp x0, x1, [acc]
+        ldp x12, x13, [nacc]
+        csel x0, x12, x0, ne
+        csel x1, x13, x1, ne
+
+        ldp x2, x3, [acc+16]
+        ldp x12, x13, [nacc+16]
+        csel x2, x12, x2, ne
+        csel x3, x13, x3, ne
+
+        ldp x4, x5, [acc+32]
+        ldp x12, x13, [nacc+32]
+        csel x4, x12, x4, ne
+        csel x5, x13, x5, ne
+
+        ldp x6, x7, [acc+48]
+        ldp x12, x13, [nacc+48]
+        csel x6, x12, x6, ne
+        csel x7, x13, x7, ne
+
+        ldp x8, x9, [acc+64]
+        ldp x12, x13, [nacc+64]
+        csel x8, x12, x8, ne
+        csel x9, x13, x9, ne
+
+        ldp x10, x11, [acc+80]
+        ldp x12, x13, [nacc+80]
+        csel x10, x12, x10, ne
+        csel x11, x13, x11, ne
+
+        stp x0, x1, [acc]
+        stp x2, x3, [acc+16]
+        stp x4, x5, [acc+32]
+        stp x6, x7, [acc+48]
+        stp x8, x9, [acc+64]
+        stp x10, x11, [acc+80]
+
+// Loop while blocksize * i <= 256
+
+        add i, i, #1
+        mul x0, blocksize, i
+        cmp x0, #257
+        bcc p256_scalarmulbase_loop
+
+// That's the end of the main loop, and we just need to translate
+// back from the Jacobian representation to affine.
First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + add x0, z2 + add x1, acc+64 + bl p256_scalarmulbase_local_montsqr_p256 + + add x0, z3 + add x1, acc+64 + add x2, z2 + bl p256_scalarmulbase_local_montmul_p256 + + add x0, z2 + add x1, z3 + bl p256_scalarmulbase_local_demont_p256 + + add x0, z3 + add x1, z2 + bl p256_scalarmulbase_local_inv_p256 + + add x0, z2 + add x1, acc+64 + add x2, z3 + bl p256_scalarmulbase_local_montmul_p256 + +// Convert back from Jacobian (X,Y,Z) |-> (X/Z^2, Y/Z^3) + + mov x0, res + add x1, acc + add x2, z2 + bl p256_scalarmulbase_local_montmul_p256 + + add x0, res, #32 + add x1, acc+32 + add x2, z3 + bl p256_scalarmulbase_local_montmul_p256 + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x25, x30, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmulbase_local_demont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + lsl x7, x2, #32 + subs x8, x2, x7 + lsr x6, x2, #32 + sbc x2, x2, x6 + adds x3, x3, x7 + adcs x4, x4, x6 + adcs x5, x5, x8 + adc x2, x2, xzr + lsl x7, x3, #32 + subs x8, x3, x7 + lsr x6, x3, #32 + sbc x3, x3, x6 + adds x4, x4, x7 + adcs x5, x5, x6 + adcs x2, x2, x8 + adc x3, x3, xzr + lsl x7, x4, #32 + subs x8, x4, x7 + lsr x6, x4, #32 + sbc x4, x4, x6 + adds x5, x5, x7 + adcs x2, x2, x6 + adcs x3, x3, x8 + adc x4, x4, xzr + lsl x7, x5, #32 + subs x8, x5, x7 + lsr x6, x5, #32 + sbc x5, x5, x6 + adds x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x8 + adc x5, x5, xzr + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +p256_scalarmulbase_local_inv_p256: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + sub sp, sp, #0xa0 + mov x20, x0 + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [sp] + stp xzr, x13, [sp, #16] + str xzr, [sp, #32] + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #16] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + stp x2, x3, [sp, #48] + stp x4, x5, [sp, #64] + str xzr, [sp, #80] + stp xzr, xzr, [sp, #96] + stp xzr, xzr, [sp, #112] + mov x10, #0x4000000000000 + stp x10, xzr, [sp, #128] + stp xzr, xzr, [sp, #144] + mov x21, #0xa + mov x22, #0x1 + b p256_scalarmulbase_inv_midloop +p256_scalarmulbase_inv_loop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, 
x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #48] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #64] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #56] + ldr x7, [sp, #24] + eor x1, x7, x14 + ldr x23, [sp, #32] + eor x3, x23, x14 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #72] + eor x1, x8, x15 + ldr x24, [sp, #80] + eor x0, x24, x15 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + asr x3, x3, #59 + str x3, [sp, #32] + eor x1, x7, x16 + eor x5, x23, x16 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + eor x0, x24, x17 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #64] + extr x2, x5, x2, #59 + str x2, [sp, #72] + asr x5, x5, #59 + str x5, [sp, #80] + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #128] + adc x3, x3, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #136] + adc x4, x4, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #144] + adc x2, x2, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x6, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x6, x6, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x6, x6, x10 + adcs x5, x5, x14 + adcs 
x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x6, x6, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + stp x1, x6, [sp, #96] + stp x5, x3, [sp, #112] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + ldp x0, x1, [sp, #128] + ldr x3, [sp, #144] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x3, x3, x11 + mov x10, #0x2000000000000000 + adcs x2, x2, x10 + mov x14, #0x1fffffffe0000000 + adc x5, x5, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x3, x3, x10 + adcs x2, x2, x14 + adcs x5, x5, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x3, x3, x11 + sbcs x2, x2, xzr + sbc x5, x5, x10 + stp x1, x3, [sp, #128] + stp x2, x5, [sp, #144] +p256_scalarmulbase_inv_midloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #48] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, 
xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, 
x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, 
ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + bne p256_scalarmulbase_inv_loop + ldr x0, [sp] + ldr x1, [sp, #48] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + 
adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x2, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x2, x2, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x2, x2, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x2, x2, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0xffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + stp x10, x11, [x20] + stp x12, x13, [x20, #16] + add sp, sp, #0xa0 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p256_scalarmulbase_local_montmul_p256: + ldr q20, [x2] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x6, x10, [x2] + ldp x11, x15, [x1, #16] + rev64 v16.4S, v20.4S + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4S, v16.4S, v0.4S + umulh x12, x17, x10 + uzp1 v28.4S, v20.4S, v0.4S + subs x14, x11, x7 + ldr q20, [x2, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2D, v16.4S + umulh x4, x7, x6 + uzp1 v21.4S, v0.4S, v0.4S + cneg x11, x8, cc + shl v17.2D, v27.2D, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2D, v21.2S, v28.2S + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2S, v20.2D + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4S, v20.4S, v20.4S + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2S, v28.2D + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x2, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x9, x3, x13 + adcs x3, x8, x7 + umulh x8, x14, x11 + umull v21.2D, v0.2S, v1.2S + adcs x12, x10, x12 + umull v3.2D, v0.2S, v16.2S + adc x15, x15, xzr + rev64 v24.4S, v20.4S + stp x12, x15, [x0, #16] + movi v2.2D, #0x00000000ffffffff + mul x10, x14, x11 + mul v4.4S, v24.4S, v28.4S + subs x13, x14, x5 + uzp2 v19.4S, 
v28.4S, v28.4S + csetm x15, cc + usra v3.2D, v21.2D, #32 + mul x7, x5, x1 + umull v21.2D, v19.2S, v16.2S + cneg x13, x13, cc + uaddlp v5.2D, v4.4S + subs x11, x1, x11 + and v16.16B, v3.16B, v2.16B + umulh x5, x5, x1 + shl v24.2D, v5.2D, #32 + cneg x11, x11, cc + umlal v16.2D, v19.2S, v1.2S + cinv x12, x15, cc + umlal v24.2D, v0.2S, v1.2S + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + stp x9, x3, [x0] + usra v21.2D, v3.2D, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2D, v16.2D, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + ldp x15, x8, [x0] + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + ldp x9, x13, [x0, #16] + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x15 + adcs x15, x16, x8 + eor x5, x17, x4 + adcs x9, x1, x9 + eor x1, x10, x5 + adcs x16, x2, x13 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [x0] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [x0, #16] + ret + +p256_scalarmulbase_local_montsqr_p256: + ldr q19, [x1] + ldp x9, x13, [x1] + ldr q23, [x1, #16] + ldr q0, [x1] + ldp x1, x10, [x1, #16] + uzp2 v29.4S, v19.4S, v19.4S + xtn v4.2S, v19.2D + umulh x8, x9, x13 + rev64 v20.4S, v23.4S + umull v16.2D, v19.2S, v19.2S + umull v1.2D, v29.2S, v4.2S + mul v20.4S, v20.4S, v0.4S + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2D, v19.4S, v19.4S + mov x4, v16.d[0] + uzp1 v17.4S, v23.4S, v0.4S + uaddlp v19.2D, v20.4S + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4S, v0.4S, v0.4S + shl v19.2D, v19.2D, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2D, v20.2S, v17.2S + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, 
x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ret + +p256_scalarmulbase_local_p256_montjmixadd: + stp x19, x20, [sp, #-16]! + sub sp, sp, #0xc0 + mov x17, x0 + mov x19, x1 + mov x20, x2 + ldp x2, x3, [x19, #64] + ldp x4, x5, [x19, #80] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + 
mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp] + stp x10, x6, [sp, #16] + ldp x3, x4, [x19, #64] + ldp x5, x6, [x19, #80] + ldp x7, x8, [x20, #32] + ldp x9, x10, [x20, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #32] + stp x11, x12, [sp, #48] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #32] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #48] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 
+ adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #32] + stp x3, x4, [sp, #48] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #16] + ldp x7, x8, [x20] + ldp x9, x10, [x20, #16] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #64] + stp x11, x12, [sp, #80] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #64] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #80] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, 
x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #64] + stp x3, x4, [sp, #80] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #16] + ldp x7, x8, [sp, #32] + ldp x9, x10, [sp, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #32] + stp x11, x12, [sp, #48] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #32] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #48] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, 
#0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #32] + stp x3, x4, [sp, #48] + ldp x5, x6, [sp, #64] + ldp x4, x3, [x19] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [x19, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [x19, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [x19, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + ldp x4, x5, [sp, #176] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp, #96] + stp x10, x6, [sp, #112] + ldp x2, x3, [sp, #32] + ldp x4, x5, [sp, #48] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + 
umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + subs x13, x15, x12 + lsr x11, x15, #32 + sbc x15, x15, x11 + adds x16, x16, x12 + adcs x0, x0, x11 + adcs x1, x1, x13 + adc x15, x15, xzr + lsl x12, x16, #32 + subs x13, x16, x12 + lsr x11, x16, #32 + sbc x16, x16, x11 + adds x0, x0, x12 + adcs x1, x1, x11 + adcs x15, x15, x13 + adc x16, x16, xzr + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, cc + csetm x13, cc + subs x12, x5, x4 + cneg x12, x12, cc + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, cc + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + subs x13, x6, x12 + lsr x11, x6, #32 + sbc x6, x6, x11 + adds x7, x7, x12 + adcs x8, x8, x11 + adcs x9, x9, x13 + adcs x10, x10, x6 + adc x6, xzr, xzr + lsl x12, x7, #32 + subs x13, x7, x12 + lsr x11, x7, #32 + sbc x7, x7, x11 + adds x8, x8, x12 + adcs x9, x9, x11 + adcs x10, x10, x13 + adcs x6, x6, x7 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #0xffffffff + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, cs + csel x9, x11, x9, cs + csel x10, x12, x10, cs + csel x6, x13, x6, cs + stp x8, x9, [sp] + stp x10, x6, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, x8, [x19] + ldp x9, x10, [x19, #16] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #128] + stp x11, x12, [sp, #144] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + 
cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #128] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #144] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #128] + stp x3, x4, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, x8, [sp, #64] + ldp x9, x10, [sp, #80] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #64] + stp x11, x12, [sp, #80] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs 
x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #64] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #80] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #64] + stp x3, x4, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x5, x6, [sp, #176] + ldp x7, x8, [x19, #64] + ldp x9, x10, [x19, #80] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 
+ adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #160] + stp x11, x12, [sp, #176] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #160] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #176] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #160] + stp x3, x4, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x5, x6, [sp, #112] + ldp x7, 
x8, [x19, #32] + ldp x9, x10, [x19, #48] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #96] + stp x11, x12, [sp, #112] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #96] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #112] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #96] + stp x3, x4, [sp, #112] + ldp x3, x4, [sp, #32] + ldp x5, x6, [sp, #48] + ldp x7, x8, [sp, #128] + ldp x9, x10, [sp, #144] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, 
x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, cc + csetm x1, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + stp x13, x14, [sp, #128] + stp x11, x12, [sp, #144] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, cc + csetm x1, cc + subs x0, x10, x9 + cneg x0, x0, cc + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, cc + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #128] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #144] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, cc + csetm x4, cc + subs x0, x8, x7 + cneg x0, x0, cc + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, cc + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x0, x11, #32 + subs x1, x11, x0 + lsr x16, x11, #32 + sbc x11, x11, x16 + adds x12, x12, x0 + adcs x13, x13, x16 + adcs x14, x14, x1 + adc x11, x11, xzr + lsl x0, x12, #32 + subs x1, x12, x0 + lsr x16, x12, #32 + sbc x12, x12, x16 + adds x13, x13, x0 + adcs x14, x14, x16 + adcs x11, x11, x1 + adc x12, x12, xzr + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x16, x2, #32 + adds x4, x4, x16 + adc x10, x10, xzr + neg x15, x2 + sub x16, x16, #0x1 + subs x13, x13, x15 + sbcs x14, x14, x16 + sbcs x3, x3, xzr + sbcs x4, x4, x2 + sbcs x7, x10, x2 + adds x13, x13, x7 + mov x10, #0xffffffff + and x10, x10, x7 + adcs x14, x14, x10 + adcs x3, x3, xzr + mov x10, #0xffffffff00000001 + and x10, x10, x7 + adc x4, x4, x10 + stp x13, x14, [sp, #128] + stp x3, x4, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and 
x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x19, #64] + ldp x2, x3, [x19, #80] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + ldp x0, x1, [sp] + ldp x12, x13, [x20] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [sp, #16] + ldp x12, x13, [x20, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x4, x5, [sp, #128] + ldp x12, x13, [x20, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [sp, #144] + ldp x12, x13, [x20, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x8, x9, [sp, #160] + mov x12, #0x1 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [sp, #176] + mov x12, #0xffffffffffffffff + mov x13, #0xfffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + stp x0, x1, [x17] + stp x2, x3, [x17, #16] + stp x4, x5, [x17, #32] + stp x6, x7, [x17, #48] + stp x8, x9, [x17, #64] + stp x10, x11, [x17, #80] + add sp, sp, #0xc0 + ldp x19, x20, [sp], #16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase_alt.S new file mode 100644 index 00000000000..cb4a278d446 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/p256_scalarmulbase_alt.S @@ -0,0 +1,3026 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for precomputed point on NIST curve P-256 +// Input scalar[4], blocksize, table[]; output res[8] +// +// extern void p256_scalarmulbase_alt +// (uint64_t res[static 8], +// uint64_t scalar[static 4], +// uint64_t blocksize, +// uint64_t *table); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, the input argument "table" is expected to be a table of +// multiples of the point P in Montgomery-affine form, with each block +// corresponding to "blocksize" bits of the scalar as follows, where +// B = 2^{blocksize-1} (e.g. B = 8 for blocksize = 4): +// +// For each i,j with blocksize * i <= 256 and 1 <= j <= B +// the multiple 2^{blocksize * i} * j * P is stored at +// tab[8 * (B * i + (j - 1))], considered as uint64_t pointers +// or tab + 64 * (B * i + (j - 1)) as byte pointers. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = blocksize, X3 = table +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmulbase_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmulbase_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Safe copies of inputs and additional variables, with some aliasing + +#define res x19 +#define blocksize x20 +#define table x21 +#define i x22 +#define bf x23 +#define cf x24 +#define j x25 + +// Intermediate variables on the stack. The last z2, z3 values can +// safely be overlaid on "nacc", which is no longer needed at the end. 
+// Uppercase syntactic variants make x86_att version simpler to generate + +#define rscalar sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define nacc sp, #(4*NUMSIZE) +#define tabent sp, #(7*NUMSIZE) + +#define z2 sp, #(4*NUMSIZE) +#define z3 sp, #(5*NUMSIZE) + +#define NSPACE #(9*NUMSIZE) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(p256_scalarmulbase_alt): + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the input arguments except the scalar, since that gets absorbed +// immediately. The "table" value subsequently gets shifted up each iteration +// of the loop, while "res" and "blocksize" are static throughout. + + mov res, x0 + mov blocksize, x2 + mov table, x3 + +// Load the digits of group order n_256 = [x15;x14;x13;x12] + + movbig(x12, #0xf3b9, #0xcac2, #0xfc63, #0x2551) + movbig(x13, #0xbce6, #0xfaad, #0xa717, #0x9e84) + mov x14, #0xffffffffffffffff + mov x15, #0xffffffff00000000 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 +// Store it to "rscalar" (reduced scalar) + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + + subs x6, x2, x12 + sbcs x7, x3, x13 + sbcs x8, x4, x14 + sbcs x9, x5, x15 + + csel x2, x2, x6, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + + stp x2, x3, [rscalar] + stp x4, x5, [rscalar+16] + +// Initialize the accumulator to all zeros and the "carry flag" cf to 0 + + stp xzr, xzr, [acc] + stp xzr, xzr, [acc+16] + stp xzr, xzr, [acc+32] + stp xzr, xzr, [acc+48] + stp xzr, xzr, [acc+64] + stp xzr, xzr, [acc+80] + mov cf, xzr + +// Main loop over {i >= 0 | blocksize * i <= 256}. Note the non-strict +// inequality, to allow top carry for any choices of blocksize. + + mov i, xzr + +p256_scalarmulbase_alt_loop: + +// The next raw bitfield is bf = bitfield(blocksize * i,blocksize) + cf, +// adding in the deferred carry cf. We then shift the whole scalar right +// by blocksize so we can keep picking bitfield(0,blocksize). + + ldp x0, x1, [rscalar] + ldp x2, x3, [rscalar+16] + + mov x4, #1 + lsl x4, x4, blocksize + sub x4, x4, #1 + and x4, x4, x0 + add bf, x4, cf + + neg x8, blocksize + + lsl x5, x1, x8 + + lsr x0, x0, blocksize + orr x0, x0, x5 + + lsl x6, x2, x8 + lsr x1, x1, blocksize + orr x1, x1, x6 + + lsl x7, x3, x8 + lsr x2, x2, blocksize + orr x2, x2, x7 + + lsr x3, x3, blocksize + + stp x0, x1, [rscalar] + stp x2, x3, [rscalar+16] + +// Now if bf <= B we just select entry j, unnegated and set cf = 0. +// If bf > B we set j = 2 * B - bf and negate the j'th entry, setting cf = 1. +// In either case we ultimately add bf, in the latter case with deferred +// carry as 2 * B - (2 * B - bf) = bf. + + mov x0, #1 + lsl x1, x0, blocksize + lsr x0, x1, #1 + + sub x2, x1, bf + + cmp x0, bf + cset cf, cc + csel j, x2, bf, cc + +// Load table entry j - 1 for nonzero j in constant-time style. 
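For readers of this imported file, the window recoding that the comment blocks above describe can be summarised in plain C. The sketch below is illustrative only and is not part of the patch: the names recode_p256_windows and MAXBLOCKS are invented here, and it assumes 0 < blocksize < 64 and a scalar already reduced mod n_256, exactly as the code above arranges before entering the loop.

#include <stdint.h>

/* Illustrative only: one digit and one sign flag per scalar block. */
#define MAXBLOCKS 257   /* enough for any blocksize >= 1 on a 256-bit scalar */

static uint64_t recode_p256_windows(const uint64_t scalar[4], uint64_t blocksize,
                                    uint64_t digit[MAXBLOCKS], int negate[MAXBLOCKS]) {
  uint64_t x[4] = {scalar[0], scalar[1], scalar[2], scalar[3]};
  uint64_t B = (uint64_t)1 << (blocksize - 1);   /* half the window range */
  uint64_t cf = 0, i = 0;
  do {
    /* bf = bitfield(blocksize*i, blocksize) of the remaining scalar,
       plus the carry deferred from the previous block */
    uint64_t bf = (x[0] & (2 * B - 1)) + cf;
    /* shift the whole 256-bit scalar right by blocksize bits */
    for (int k = 0; k < 3; k++)
      x[k] = (x[k] >> blocksize) | (x[k + 1] << (64 - blocksize));
    x[3] >>= blocksize;
    /* bf <= B: take the digit bf positively; bf > B: take 2*B - bf negated
       and defer a carry of one into the next block, using
       bf * 2^(blocksize*i) == -(2*B - bf) * 2^(blocksize*i) + 2^(blocksize*(i+1)) */
    negate[i] = (bf > B);
    digit[i] = negate[i] ? 2 * B - bf : bf;
    cf = (uint64_t)negate[i];
    i++;
  } while (blocksize * i <= 256);   /* non-strict bound, matching the loop above */
  return i;                         /* number of blocks, including a possible top carry */
}

Every digit lands in [0, B], which is why each block of the precomputed table only needs B entries; a zero digit corresponds to the j = 0 case that the assembly handles by simply not committing the mixed-addition result to the accumulator, and negation of an entry is realised there by replacing its y coordinate with p_256 - y.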
+ + mov x16, #1 + lsl x16, x16, blocksize + lsr x16, x16, #1 + mov x17, j + +p256_scalarmulbase_alt_tabloop: + ldp x8, x9, [table] + ldp x10, x11, [table, #16] + ldp x12, x13, [table, #32] + ldp x14, x15, [table, #48] + + subs x17, x17, #1 + csel x0, x8, x0, eq + csel x1, x9, x1, eq + csel x2, x10, x2, eq + csel x3, x11, x3, eq + csel x4, x12, x4, eq + csel x5, x13, x5, eq + csel x6, x14, x6, eq + csel x7, x15, x7, eq + + add table, table, #64 + + sub x16, x16, #1 + cbnz x16, p256_scalarmulbase_alt_tabloop + +// Before storing back, optionally negate the y coordinate of the table entry + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + cmp cf, xzr + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + +// Add the adjusted table point to the accumulator + + add x0, nacc + add x1, acc + add x2, tabent + bl p256_scalarmulbase_alt_local_p256_montjmixadd + +// However, only commit that update to the accumulator if j is nonzero, +// because the mixed addition function does not handle this case directly, +// and in any case we didn't choose the table entry appropriately. + + cmp j, xzr + ldp x0, x1, [acc] + ldp x12, x13, [nacc] + csel x0, x12, x0, ne + csel x1, x13, x1, ne + + ldp x2, x3, [acc+16] + ldp x12, x13, [nacc+16] + csel x2, x12, x2, ne + csel x3, x13, x3, ne + + ldp x4, x5, [acc+32] + ldp x12, x13, [nacc+32] + csel x4, x12, x4, ne + csel x5, x13, x5, ne + + ldp x6, x7, [acc+48] + ldp x12, x13, [nacc+48] + csel x6, x12, x6, ne + csel x7, x13, x7, ne + + ldp x8, x9, [acc+64] + ldp x12, x13, [nacc+64] + csel x8, x12, x8, ne + csel x9, x13, x9, ne + + ldp x10, x11, [acc+80] + ldp x12, x13, [nacc+80] + csel x10, x12, x10, ne + csel x11, x13, x11, ne + + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + +// Loop while blocksize * i <= 256 + + add i, i, #1 + mul x0, blocksize, i + cmp x0, #257 + bcc p256_scalarmulbase_alt_loop + +// That's the end of the main loop, and we just need to translate +// back from the Jacobian representation to affine. 
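To make the Montgomery bookkeeping in the upcoming calls easier to follow, here is a hedged C-level sketch of the same sequence. The wrapper names are hypothetical stand-ins for the local montsqr/montmul/demont/inv subroutines defined later in this file; writing R = 2^256 and montmul(a, b) = a*b*R^-1 mod p_256, the accumulator holds X*R, Y*R, Z*R, and the sequence cancels the Montgomery factor while dividing out Z.

#include <stdint.h>

/* Hypothetical prototypes mirroring the local assembly subroutines below;
   every argument is a 4-limb field element mod p_256. */
void montsqr_p256(uint64_t r[4], const uint64_t a[4]);                       /* a*a*R^-1 */
void montmul_p256(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]);  /* a*b*R^-1 */
void demont_p256(uint64_t r[4], const uint64_t a[4]);                        /* a*R^-1   */
void inv_p256(uint64_t r[4], const uint64_t a[4]);                           /* a^-1 mod p_256 */

/* acc = Jacobian (X*R, Y*R, Z*R); res = affine (X/Z^2, Y/Z^3) in plain form. */
static void jacobian_to_affine_sketch(uint64_t res[8], const uint64_t acc[12]) {
  const uint64_t *X = acc, *Y = acc + 4, *Z = acc + 8;
  uint64_t z2[4], z3[4];
  montsqr_p256(z2, Z);           /* z2 = Z^2 * R                          */
  montmul_p256(z3, Z, z2);       /* z3 = Z^3 * R                          */
  demont_p256(z2, z3);           /* z2 = Z^3      (Montgomery factor removed) */
  inv_p256(z3, z2);              /* z3 = 1/Z^3    (plain)                 */
  montmul_p256(z2, Z, z3);       /* z2 = (Z*R)*(1/Z^3)*R^-1 = 1/Z^2       */
  montmul_p256(res, X, z2);      /* (X*R)*(1/Z^2)*R^-1 = X/Z^2            */
  montmul_p256(res + 4, Y, z3);  /* (Y*R)*(1/Z^3)*R^-1 = Y/Z^3            */
}

The demont step is what makes the final two montmul calls produce plain, non-Montgomery affine coordinates, matching the comment that z2 and z3 are held "without Montgomery form".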
First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + add x0, z2 + add x1, acc+64 + bl p256_scalarmulbase_alt_local_montsqr_p256 + + add x0, z3 + add x1, acc+64 + add x2, z2 + bl p256_scalarmulbase_alt_local_montmul_p256 + + add x0, z2 + add x1, z3 + bl p256_scalarmulbase_alt_local_demont_p256 + + add x0, z3 + add x1, z2 + bl p256_scalarmulbase_alt_local_inv_p256 + + add x0, z2 + add x1, acc+64 + add x2, z3 + bl p256_scalarmulbase_alt_local_montmul_p256 + +// Convert back from Jacobian (X,Y,Z) |-> (X/Z^2, Y/Z^3) + + mov x0, res + add x1, acc + add x2, z2 + bl p256_scalarmulbase_alt_local_montmul_p256 + + add x0, res, #32 + add x1, acc+32 + add x2, z3 + bl p256_scalarmulbase_alt_local_montmul_p256 + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x25, x30, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmulbase_alt_local_demont_p256: + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + lsl x7, x2, #32 + subs x8, x2, x7 + lsr x6, x2, #32 + sbc x2, x2, x6 + adds x3, x3, x7 + adcs x4, x4, x6 + adcs x5, x5, x8 + adc x2, x2, xzr + lsl x7, x3, #32 + subs x8, x3, x7 + lsr x6, x3, #32 + sbc x3, x3, x6 + adds x4, x4, x7 + adcs x5, x5, x6 + adcs x2, x2, x8 + adc x3, x3, xzr + lsl x7, x4, #32 + subs x8, x4, x7 + lsr x6, x4, #32 + sbc x4, x4, x6 + adds x5, x5, x7 + adcs x2, x2, x6 + adcs x3, x3, x8 + adc x4, x4, xzr + lsl x7, x5, #32 + subs x8, x5, x7 + lsr x6, x5, #32 + sbc x5, x5, x6 + adds x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x8 + adc x5, x5, xzr + stp x2, x3, [x0] + stp x4, x5, [x0, #16] + ret + +p256_scalarmulbase_alt_local_inv_p256: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
+ sub sp, sp, #0xa0 + mov x20, x0 + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [sp] + stp xzr, x13, [sp, #16] + str xzr, [sp, #32] + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #16] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + stp x2, x3, [sp, #48] + stp x4, x5, [sp, #64] + str xzr, [sp, #80] + stp xzr, xzr, [sp, #96] + stp xzr, xzr, [sp, #112] + mov x10, #0x4000000000000 + stp x10, xzr, [sp, #128] + stp xzr, xzr, [sp, #144] + mov x21, #0xa + mov x22, #0x1 + b p256_scalarmulbase_alt_inv_midloop +p256_scalarmulbase_alt_inv_loop: + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + and x0, x12, x16 + and x1, x13, x17 + add x19, x0, x1 + ldr x7, [sp] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #48] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + adc x2, x2, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x7, [sp, #8] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #56] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [sp] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [sp, #48] + ldr x7, [sp, #16] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #64] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [sp, #8] + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [sp, #56] + ldr x7, [sp, #24] + eor x1, x7, x14 + ldr x23, [sp, #32] + eor x3, x23, x14 + and x3, x3, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #72] + eor x1, x8, x15 + ldr x24, [sp, #80] + eor x0, x24, x15 + and x0, x0, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [sp, #16] + extr x5, x3, x5, #59 + str x5, [sp, #24] + asr x3, x3, #59 + str x3, [sp, #32] + eor x1, x7, x16 + eor x5, x23, x16 + and x5, x5, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + eor x0, x24, x17 + and x0, x0, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [sp, #64] + extr x2, x5, x2, #59 + str x2, [sp, #72] + asr x5, x5, #59 + str x5, [sp, #80] + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + 
eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x5, x19, x0 + adc x3, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x5, x5, x0 + str x5, [sp, #128] + adc x3, x3, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x3, x3, x0 + str x3, [sp, #136] + adc x4, x4, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + eor x1, x7, x16 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, x17 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x4, x4, x0 + str x4, [sp, #144] + adc x2, x2, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x6, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x6, x6, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x6, x6, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x6, x6, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + stp x1, x6, [sp, #96] + stp x5, x3, [sp, #112] + eor x1, x7, x16 + and x5, x16, x12 + neg x5, x5 + mul x0, x1, x12 + umulh x1, x1, x12 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, x17 + and x0, x17, x13 + sub x5, x5, x0 + mul x0, x1, x13 + umulh x1, x1, x13 + adds x2, x2, x0 + adc x5, x5, x1 + ldp x0, x1, [sp, #128] + ldr x3, [sp, #144] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x3, x3, x11 + mov x10, #0x2000000000000000 + adcs x2, x2, x10 + mov x14, #0x1fffffffe0000000 + adc x5, x5, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x3, x3, x10 + adcs x2, x2, x14 + adcs x5, x5, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x3, x3, x11 + sbcs x2, x2, xzr + sbc x5, x5, x10 + stp x1, x3, [sp, #128] + stp x2, x5, [sp, #144] +p256_scalarmulbase_alt_inv_midloop: + mov x1, x22 + ldr x2, [sp] + ldr x3, [sp, #48] + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, 
#0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x8, x4, #0x100, lsl #12 + sbfx x8, x8, #21, #21 + mov x11, #0x100000 + add x11, x11, x11, lsl #21 + add x9, x4, x11 + asr x9, x9, #42 + add x10, x5, #0x100, lsl #12 + sbfx x10, x10, #21, #21 + add x11, x5, x11 + asr x11, x11, #42 + mul x6, x8, x2 + mul x7, x9, x3 + mul x2, x10, x2 + mul x3, x11, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + 
orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + 
asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #21, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #42 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #21, #21 + add x15, x5, x15 + asr x15, x15, #42 + mul x6, x12, x2 + mul x7, x13, x3 + mul x2, x14, x2 + mul x3, x15, x3 + add x4, x6, x7 + add x5, x2, x3 + asr x2, x4, #20 + asr x3, x5, #20 + and x4, x2, #0xfffff + orr x4, x4, #0xfffffe0000000000 + and x5, x3, #0xfffff + orr x5, x5, #0xc000000000000000 + tst x5, #0x1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + mul x2, x12, x8 + mul x3, x12, x9 + mul x6, x14, x8 + mul x7, x14, x9 + madd x8, x13, x10, x2 + madd x9, x13, x11, x3 + madd x16, x15, x10, x6 + madd x17, x15, x11, x7 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg 
x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + tst x5, #0x2 + asr x5, x5, #1 + csel x6, x4, xzr, ne + ccmp x1, xzr, #0x8, ne + cneg x1, x1, ge + cneg x6, x6, ge + csel x4, x5, x4, ge + add x5, x5, x6 + add x1, x1, #0x2 + asr x5, x5, #1 + add x12, x4, #0x100, lsl #12 + sbfx x12, x12, #22, #21 + mov x15, #0x100000 + add x15, x15, x15, lsl #21 + add x13, x4, x15 + asr x13, x13, #43 + add x14, x5, #0x100, lsl #12 + sbfx x14, x14, #22, #21 + add x15, x5, x15 + asr x15, x15, #43 + mneg x2, x12, x8 + mneg x3, x12, x9 + mneg x4, x14, x8 + mneg x5, x14, x9 + msub x10, x13, x16, x2 + msub x11, x13, x17, x3 + msub x12, x15, x16, x4 + msub x13, x15, x17, x5 + mov x22, x1 + subs x21, x21, #0x1 + bne p256_scalarmulbase_alt_inv_loop + ldr x0, [sp] + ldr x1, [sp, #48] + mul x0, x0, x10 + madd x1, x1, x11, x0 + asr x0, x1, #63 + cmp x10, xzr + csetm x14, mi + cneg x10, x10, mi + eor x14, x14, x0 + cmp x11, xzr + csetm x15, mi + cneg x11, x11, mi + eor x15, x15, x0 + cmp x12, xzr + csetm x16, mi + cneg x12, x12, mi + eor x16, x16, x0 + cmp x13, xzr + csetm x17, mi + cneg x13, x13, mi + eor x17, x17, x0 + and x0, x10, x14 + and x1, x11, x15 + add x9, x0, x1 + ldr x7, [sp, #96] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x4, x9, x0 + adc x2, xzr, x1 + ldr x8, [sp, #128] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x4, x4, x0 + str x4, [sp, #96] + adc x2, x2, x1 + ldr x7, [sp, #104] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [sp, #136] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x2, x2, x0 + str x2, [sp, #104] + adc x6, x6, x1 + ldr x7, [sp, #112] + eor x1, x7, x14 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [sp, #144] + eor x1, x8, x15 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x6, x6, x0 + str x6, [sp, #112] + adc x5, x5, x1 + ldr x7, [sp, #120] + eor x1, x7, x14 + and x3, x14, x10 + neg x3, x3 + mul x0, x1, x10 + umulh x1, x1, x10 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [sp, #152] + eor x1, x8, x15 + and x0, x15, x11 + sub x3, x3, x0 + mul x0, x1, x11 + umulh x1, x1, x11 + adds x5, x5, x0 + adc x3, x3, x1 + ldp x0, x1, [sp, #96] + ldr x2, [sp, #112] + mov x14, #0xe000000000000000 + adds x0, x0, x14 + sbcs x1, x1, xzr + mov x11, #0x1fffffff + adcs x2, x2, x11 + mov x10, #0x2000000000000000 + adcs x5, x5, x10 + mov x14, #0x1fffffffe0000000 + adc x3, x3, x14 + lsl x11, x0, #32 + subs x14, x0, x11 + lsr x10, x0, #32 + sbc x0, x0, x10 + adds x1, x1, x11 + adcs x2, x2, x10 + adcs x5, x5, x14 + adcs x3, x3, x0 + mov x14, #0xffffffffffffffff + mov x11, #0xffffffff + mov x10, #0xffffffff00000001 + csel x14, x14, xzr, cs + csel x11, x11, xzr, cs + csel x10, x10, xzr, cs + subs x1, x1, x14 + sbcs x2, x2, x11 + sbcs x5, x5, xzr + sbc x3, x3, x10 + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0xffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + stp x10, x11, [x20] + stp x12, x13, [x20, #16] + add sp, sp, #0xa0 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p256_scalarmulbase_alt_local_montmul_p256: + ldp x3, x4, [x1] + ldp x7, x8, [x2] + mul x12, x3, x7 + umulh x13, 
x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x2, #16] + mul x11, x3, x9 + umulh x15, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x16, x3, x10 + adcs x15, x15, x11 + adc x16, x16, xzr + ldp x5, x6, [x1, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x15, x15, x11 + mul x11, x4, x10 + adcs x16, x16, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x15, x15, x11 + umulh x11, x4, x9 + adcs x16, x16, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x15, x15, x11 + mul x11, x5, x9 + adcs x16, x16, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x15, x15, x11 + umulh x11, x5, x8 + adcs x16, x16, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x15, x15, x11 + mul x11, x6, x8 + adcs x16, x16, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x15, x15, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x16, x16, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x15, x15, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x15, x15, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x15, lsl #32 + lsr x11, x15, #32 + adcs x13, x13, x11 + mul x11, x15, x10 + umulh x15, x15, x10 + adcs x14, x14, x11 + adc x15, x15, xzr + adds x12, x12, x16 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x15, x15, x5 + cset x8, cs + mov x11, #0xffffffff + adds x16, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x15, x10 + sbcs xzr, x8, xzr + csel x12, x12, x16, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x15, x15, x5, cc + stp x12, x13, [x0] + stp x14, x15, [x0, #16] + ret + +p256_scalarmulbase_alt_local_montsqr_p256: + ldp x2, x3, [x1] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x1, #16] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x2, x8, #32 + adcs x10, x10, x2 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x2, x9, #32 + adcs x11, x11, x2 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x2, x10, #32 + adcs x8, x8, x2 + mul x2, 
x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x2, x11, #32 + adcs x9, x9, x2 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [x0] + stp x10, x11, [x0, #16] + ret + +p256_scalarmulbase_alt_local_p256_montjmixadd: + sub sp, sp, #0xc0 + mov x15, x0 + mov x16, x1 + mov x17, x2 + ldp x2, x3, [x16, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [x16, #64] + ldp x7, x8, [x17, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + 
adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x3, x4, [sp] + ldp x7, x8, [x17] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel 
x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x3, x4, [sp] + ldp x7, x8, [sp, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x5, x6, [sp, #64] + ldp x4, x3, [x16] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [x16, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [x16, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [x16, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #176] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, 
x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp, #96] + stp x10, x11, [sp, #112] + ldp x2, x3, [sp, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + mov x5, #0xffffffff00000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x7, x8, [x16] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, 
x13, x11 + ldp x9, x10, [x16, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 
+ umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x16, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, 
x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #160] + stp x14, x0, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [x16, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x3, x4, [sp, #32] + ldp x7, x8, [sp, #128] + mul 
x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #144] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #48] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x16, #64] + ldp x2, x3, [x16, #80] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + ldp x0, x1, [sp] + ldp x12, x13, [x17] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [sp, #16] + ldp x12, x13, [x17, #16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + ldp x4, x5, [sp, #128] + ldp x12, x13, [x17, #32] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [sp, #144] + ldp x12, x13, [x17, #48] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + ldp x8, x9, [sp, #160] + mov x12, #0x1 + mov x13, #0xffffffff00000000 + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [sp, #176] + mov x12, #0xffffffffffffffff + mov x13, #0xfffffffe + csel x10, x10, x12, ne + csel x11, x11, x13, ne + stp x0, x1, [x15] + stp x2, x3, [x15, #16] + stp x4, x5, [x15, #32] + stp x6, x7, [x15, #48] + stp x8, x9, [x15, #64] + stp x10, x11, [x15, #80] + add sp, sp, #0xc0 + 
ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/README.md b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/README.md new file mode 100644 index 00000000000..fa63f949fd1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/README.md @@ -0,0 +1,9 @@ +This directory contains Arm implementations that are functional but slower +than the implementations with the same file names. The implementations in the +parent directory are mechanically/manually optimized versions of this +directory, meaning that their high-level algorithms are unchanged but the +implementation details are updated. + +These functions will only be compiled when running HOL Light proofs using +`make proofs` because HOL Light proofs use these to keep proofs simple. +The compiled object files will not be included in libs2nbignum. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montmul_p256_base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montmul_p256_base.S new file mode 100644 index 00000000000..b4dd6087fdd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montmul_p256_base.S @@ -0,0 +1,275 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256_base +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256 (in particular this is true if we are in +// the "usual" case x < p_256 and y < p_256). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256_base) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +#define muldiffn(c,h,l, t, x,y, w,z) \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ + eor h, h, c + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with t0, t1 and t2. +// It is fine for d4 to be the same register as d0, and it often is. 
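+// (A brief added note on why this works: the multiplier 2^256 - 2^224 + 2^192 + 2^96 is p_256 + 1, so the +// sum differs from the original [d3;d2;d1;d0] by exactly p_256 * w; because p_256 == -1 (mod 2^64) the +// bottom word of that sum is zero, and returning only the upper words [d4;d3;d2;d1] therefore divides by +// 2^64, i.e. performs one word-sized Montgomery reduction step modulo p_256.)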
+// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence [d4;..;d1] := [d3;d2;d1;0] + (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adc d4, d0, xzr + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define s0 x11 +#define s1 x12 +#define s2 x13 +#define s3 x14 +#define t0 x15 +#define t1 x16 +#define t2 x17 +#define t3 x1 +#define s4 x2 + +S2N_BN_SYMBOL(bignum_montmul_p256_base): + +// Load in all words of both inputs + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + ldp b0, b1, [x2] + ldp b2, b3, [x2, #16] + +// Multiply low halves with a 2x2->4 ADK multiplier as L = [s3;s2;s1;s0] + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a0,a1, b1,b0) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Perform two "short" Montgomery steps on the low product to +// get a modified low result L' = [s1;s0;s3;s2] +// This shifts it to an offset compatible with middle terms +// Stash the result L' temporarily in the output buffer to avoid +// using additional registers. + + montreds(s0,s3,s2,s1,s0, t1,t2,t3) + montreds(s1,s0,s3,s2,s1, t1,t2,t3) + + stp s2, s3, [x0] + stp s0, s1, [x0, #16] + +// Multiply high halves with a 2x2->4 ADK multiplier as H = [s3;s2;s1;s0] + + mul s0, a2, b2 + mul s2, a3, b3 + umulh s1, a2, b2 + adds t1, s0, s2 + umulh s3, a3, b3 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a2,a3, b3,b2) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Compute sign-magnitude a2,[a1,a0] = x_hi - x_lo + + subs a0, a2, a0 + sbcs a1, a3, a1 + sbc a2, xzr, xzr + adds xzr, a2, #1 + eor a0, a0, a2 + adcs a0, a0, xzr + eor a1, a1, a2 + adcs a1, a1, xzr + +// Compute sign-magnitude b2,[b1,b0] = y_lo - y_hi + + subs b0, b0, b2 + sbcs b1, b1, b3 + sbc b2, xzr, xzr + adds xzr, b2, #1 + eor b0, b0, b2 + adcs b0, b0, xzr + eor b1, b1, b2 + adcs b1, b1, xzr + +// Save the correct sign for the sub-product in b3 + + eor b3, a2, b2 + +// Add the high H to the modified low term L' as H + L' = [s4;b2;a2;t3;t0] + + ldp t0, t3, [x0] + adds t0, s0, t0 + adcs t3, s1, t3 + ldp a2, b2, [x0, #16] + adcs a2, s2, a2 + adcs b2, s3, b2 + adc s4, xzr, xzr + +// Multiply with yet a third 2x2->4 ADK multiplier for complex mid-term M + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(a1,t2,t1, a0, a0,a1, b1,b0) + adds xzr, a1, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, a1 + +// Set up a sign-modified version of the mid-product in a long accumulator +// as [b3;a1;a0;s3;s2;s1;s0], adding in the H + L' term once with +// zero offset as this signed value is created + + adds xzr, b3, #1 + eor s0, s0, b3 + adcs s0, s0, t0 + eor s1, s1, b3 + adcs s1, s1, t3 + eor s2, s2, b3 + adcs s2, s2, a2 + eor s3, s3, b3 
+ adcs s3, s3, b2 + adcs a0, s4, b3 + adcs a1, b3, xzr + adc b3, b3, xzr + +// Add in the stashed H + L' term an offset of 2 words as well + + adds s2, s2, t0 + adcs s3, s3, t3 + adcs a0, a0, a2 + adcs a1, a1, b2 + adc b3, b3, s4 + +// Do two more Montgomery steps on the composed term +// Net pre-reduct is in [b3;a1;a0;s3;s2] + + montreds(s0,s3,s2,s1,s0, t1,t2,t3) + montreds(s1,s0,s3,s2,s1, t1,t2,t3) + + adds a0, a0, s0 + adcs a1, a1, s1 + adc b3, b3, xzr + +// Because of the way we added L' in two places, we can overspill by +// more than usual in Montgomery, with the result being only known to +// be < 3 * p_256, not the usual < 2 * p_256. So now we do a more +// elaborate final correction in the style of bignum_cmul_p256, though +// we can use much simpler quotient estimation logic (q = h + 1) and +// slightly more direct accumulation of p_256 * q. + +#define d0 s2 +#define d1 s3 +#define d2 a0 +#define d3 a1 +#define h b3 + +#define q s4 +#define c b0 + + add q, h, #1 + lsl t1, q, #32 + + adds d3, d3, t1 + adc h, h, xzr + sub t0, xzr, q + sub t1, t1, #1 + subs d0, d0, t0 + sbcs d1, d1, t1 + sbcs d2, d2, xzr + sbcs d3, d3, q + sbcs c, h, q + adds d0, d0, c + mov h, #0x00000000ffffffff + and h, h, c + adcs d1, d1, h + adcs d2, d2, xzr + mov h, #0xffffffff00000001 + and h, h, c + adc d3, d3, h + +// Finally store the result + + stp d0, d1, [x0] + stp d2, d3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montsqr_p256_base.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montsqr_p256_base.S new file mode 100644 index 00000000000..d6d0a9ebde9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/bignum_montsqr_p256_base.S @@ -0,0 +1,266 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256_base +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256, assuming x^2 <= 2^256 * p_256, which is +// guaranteed in particular if x < p_256 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256_base) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +#define muldiffn(c,h,l, t, x,y, w,z) \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ + eor h, h, c + +// --------------------------------------------------------------------------- +// Core one-step "end" Montgomery reduction macro. 
Takes input in +// [d5;d4;d3;d2;d1;d0] and returns result in [d5;d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], re-using d0 as a temporary internally as well +// as t1, t2, t3, and initializing d5 from zero (hence "end"). +// --------------------------------------------------------------------------- + +#define montrede(d5, d4,d3,d2,d1,d0, t2,t1,t0) \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence basic [d4;d3;d2;d1] += (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adcs d4, d4, d0 __LF \ + adc d5, xzr, xzr + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1] and generating d4 from zero, re-using +// d0 as a temporary internally together with t0, t1 and t2. +// It is fine for d4 to be the same register as d0, and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32 __LF \ + subs t0, d0, t1 __LF \ + lsr t2, d0, #32 __LF \ + sbc d0, d0, t2 __LF \ +/* Hence [d4;..;d1] := [d3;d2;d1;0] + (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, t0 __LF \ + adc d4, d0, xzr + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define c0 x6 +#define c1 x7 +#define c2 x8 +#define c3 x9 +#define c4 x10 +#define d1 x11 +#define d2 x12 +#define d3 x13 +#define d4 x14 + +#define s0 x15 +#define s1 x16 +#define s2 x17 +#define s3 x1 + +#define a0short w2 +#define a1short w3 +#define d1short w11 + +S2N_BN_SYMBOL(bignum_montsqr_p256_base): + +// Load in all words of the input + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + +// Square the low half, getting a result in [s3;s2;s1;s0] +// This uses 32x32->64 multiplications to reduce the number of UMULHs + + umull s0, a0short, a0short + lsr d1, a0, #32 + umull s1, d1short, d1short + umull d1, a0short, d1short + adds s0, s0, d1, lsl #33 + lsr d1, d1, #31 + adc s1, s1, d1 + umull s2, a1short, a1short + lsr d1, a1, #32 + umull s3, d1short, d1short + umull d1, a1short, d1short + mul d2, a0, a1 + umulh d3, a0, a1 + adds s2, s2, d1, lsl #33 + lsr d1, d1, #31 + adc s3, s3, d1 + adds d2, d2, d2 + adcs d3, d3, d3 + adc s3, s3, xzr + adds s1, s1, d2 + adcs s2, s2, d3 + adc s3, s3, xzr + +// Perform two "short" Montgomery steps on the low square +// This shifts it to an offset compatible with middle product + + montreds(s0,s3,s2,s1,s0, d1,d2,d3) + + montreds(s1,s0,s3,s2,s1, d1,d2,d3) + +// Compute cross-product with ADK 2x2->4 multiplier as [c3;c2;c1;c0] + + mul c0, a0, a2 + mul d4, a1, a3 + umulh c2, a0, a2 + muldiffn(d3,d2,d1, c4, a0,a1, a3,a2) + + adds c1, c0, c2 + adc c2, c2, xzr + + umulh c3, a1, a3 + + adds c1, c1, d4 + adcs c2, c2, c3 + adc c3, c3, xzr + adds c2, c2, d4 + adc c3, c3, xzr + + adds xzr, d3, #1 + adcs c1, c1, d1 + adcs c2, c2, d2 + adc c3, c3, d3 + +// Double it and add the Montgomerified low 
square + + adds c0, c0, c0 + adcs c1, c1, c1 + adcs c2, c2, c2 + adcs c3, c3, c3 + adc c4, xzr, xzr + + adds c0, c0, s2 + adcs c1, c1, s3 + adcs c2, c2, s0 + adcs c3, c3, s1 + adc c4, c4, xzr + +// Montgomery-reduce the combined low and middle term another twice + + montrede(c0,c4,c3,c2,c1,c0, d1,d2,d3) + + montrede(c1,c0,c4,c3,c2,c1, d1,d2,d3) + +// Our sum so far is in [c1,c0,c4,c3,c2]; choose more intuitive names + +#define r0 x8 +#define r1 x9 +#define r2 x10 +#define r3 x6 +#define c x7 + +// So we can have these as temps + +#define t1 x11 +#define t2 x12 +#define t3 x13 + +// Add in the pure squares 22 + 33 + + mul t1, a2, a2 + adds r0, r0, t1 + mul t2, a3, a3 + umulh t1, a2, a2 + adcs r1, r1, t1 + adcs r2, r2, t2 + umulh t2, a3, a3 + adcs r3, r3, t2 + adc c, c, xzr + +// Construct the 23 term, double and add it in + + mul t1, a2, a3 + umulh t2, a2, a3 + adds t1, t1, t1 + adcs t2, t2, t2 + adc t3, xzr, xzr + + adds r1, r1, t1 + adcs r2, r2, t2 + adcs r3, r3, t3 + adcs c, c, xzr + +// We know, writing B = 2^{4*64} that the full implicit result is +// B^2 c <= z + (B - 1) * p < B * p + (B - 1) * p < 2 * B * p, +// so the top half is certainly < 2 * p. If c = 1 already, we know +// subtracting p will give the reduced modulus. But now we do a +// subtraction-comparison to catch cases where the residue is >= p. +// The constants are such that [t3;0;t1;-1] = p_256. + +#define t0 x5 + +// Set CF (because of inversion) iff (0,p_256) <= (c,r3,r2,r1,r0) + + mov t1, #0x00000000ffffffff + subs t0, r0, #-1 + sbcs t1, r1, t1 + mov t3, #0xffffffff00000001 + sbcs t2, r2, xzr + sbcs t3, r3, t3 + sbcs xzr, c, xzr + +// Select final output accordingly + + csel r0, t0, r0, cs + csel r1, t1, r1, cs + csel r2, t2, r2, cs + csel r3, t3, r3, cs + +// Store things back in place + + stp r0, r1, [x0] + stp r2, r3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjadd.S new file mode 100644 index 00000000000..053c8bac8d4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjadd.S @@ -0,0 +1,612 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
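+// Working with Jacobian triples lets the addition be computed entirely from field multiplications, +// squarings and additions/subtractions (montmul/montsqr/sub below); no field inversion is needed until +// a result is eventually mapped back to affine form.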
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x21 +#define input_x x22 +#define input_y x23 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + + +#define montmul_p256(P0,P1,P2) \ + add x0, P0;\ + add x1, P1;\ + add x2, P2;\ + bl .montmul_p256 + +#define montsqr_p256(P0,P1) \ + add x0, P0;\ + add x1, P1;\ + bl .montsqr_p256 + +#define sub_p256(P0,P1,P2) \ + add x0, P0;\ + add x1, P1;\ + add x2, P2;\ + bl .sub_p256 + + +// Corresponds exactly to bignum_montmul_p256 + +.montmul_p256: + ldr q20, [x2] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x6, x10, [x2] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x2, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x2, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x9, x3, x13 + adcs x3, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x12, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x15, x15, xzr + rev64 v24.4s, v20.4s + 
stp x12, x15, [x0, #16] + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + stp x9, x3, [x0] + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + ldp x15, x8, [x0] + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + ldp x9, x13, [x0, #16] + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x15 + adcs x15, x16, x8 + eor x5, x17, x4 + adcs x9, x1, x9 + eor x1, x10, x5 + adcs x16, x2, x13 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [x0] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [x0, #16] + ret + +// Corresponds exactly to bignum_montsqr_p256 + +.montsqr_p256: + ldr q19, [x1] + ldp x9, x13, [x1] + ldr q23, [x1, #16] + ldr q0, [x1] + ldp x1, x10, [x1, #16] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul 
x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ret + +// Corresponds exactly to bignum_sub_p256 + +.sub_p256: + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + ret + + +S2N_BN_SYMBOL(p256_montjadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x30, [sp, #-16]! 
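+// (NSPACE = 7 * NUMSIZE = 224 bytes, one 32-byte slot per aliased temporary above)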
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + montsqr_p256(z1sq,z_1) + montsqr_p256(z2sq,z_2) + + montmul_p256(y1a,z_2,y_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,z1sq,x_2) + montmul_p256(x1a,z2sq,x_1) + montmul_p256(y2a,z1sq,y2a) + montmul_p256(y1a,z2sq,y1a) + + sub_p256(xd,x2a,x1a) + sub_p256(yd,y2a,y1a) + + montsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x1a) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(xd,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y1a) + montmul_p256(resz,xd,z_2) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore registers and return + + add sp, sp, NSPACE + ldp x23, x30, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjdouble.S new file mode 100644 index 00000000000..befe861db25 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p256/unopt/p256_montjdouble.S @@ -0,0 +1,748 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x19 +#define input_x x20 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y4 sp, #(NUMSIZE*0) + +#define y2 sp, #(NUMSIZE*1) + +#define t1 sp, #(NUMSIZE*2) + +#define t2 sp, #(NUMSIZE*3) +#define x2p sp, #(NUMSIZE*3) +#define dx2 sp, #(NUMSIZE*3) + +#define xy2 sp, #(NUMSIZE*4) + +#define x4p sp, #(NUMSIZE*5) +#define d_ sp, #(NUMSIZE*5) + +#define NSPACE #(NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256 + +.montmul_p256: + ldr q20, [x2] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x6, x10, [x2] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x2, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x2, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x9, x3, x13 + adcs x3, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x12, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x15, x15, xzr + rev64 v24.4s, v20.4s + stp x12, x15, [x0, #16] + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul 
v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + stp x9, x3, [x0] + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + ldp x15, x8, [x0] + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + ldp x9, x13, [x0, #16] + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x15 + adcs x15, x16, x8 + eor x5, x17, x4 + adcs x9, x1, x9 + eor x1, x10, x5 + adcs x16, x2, x13 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [x0] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [x0, #16] + ret + +// Corresponds exactly to bignum_montsqr_p256 + +.montsqr_p256: + ldr q19, [x1] + ldp x9, x13, [x1] + ldr q23, [x1, #16] + ldr q0, [x1] + ldp x1, x10, [x1, #16] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs 
x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ret + +// Corresponds exactly to bignum_sub_p256 + +.sub_p256: + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + ret + +// Corresponds exactly to bignum_add_p256 + +.add_p256: + ldp x4, x5, [x1] + ldp x8, x9, [x2] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x1, #16] + ldp x10, x11, [x2, #16] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #0xffffffff + sbcs x9, x5, x9 + sbcs x10, x6, xzr + mov x11, #0xffffffff00000001 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, cc + csel x5, x5, x9, cc + csel x6, x6, x10, cc + csel x7, x7, x11, cc + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + ret + + +#define montmul_p256(P0,P1,P2) \ + add x0, P0;\ + add x1, P1;\ + add x2, P2;\ + bl .montmul_p256 + +#define montsqr_p256(P0,P1) \ + add x0, P0;\ + add x1, P1;\ + bl .montsqr_p256 + +#define sub_p256(P0,P1,P2) \ + add x0, P0;\ + add x1, P1;\ + add x2, P2;\ + bl .sub_p256 + +#define add_p256(P0,P1,P2) \ + add x0, P0;\ + add x1, P1;\ + add x2, P2;\ + bl .add_p256 + + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_p256(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + csetm x3, cs __LF \ + subs x5, x5, x3 __LF \ + and x1, x3, #4294967295 __LF \ + sbcs x6, x6, x1 __LF \ + sbcs x7, x7, xzr __LF \ + and x2, x3, #-4294967295 __LF \ + sbc x8, x8, x2 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// P0 = C * P1 - D * P2 computed as D * (p_256 - P2) + 
C * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_p256 +// This also applies to the other functions following. + +#define cmsub_p256(P0,C,P1,D,P2) \ + mov x1, D __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x2, #4294967295 __LF \ + sbcs x10, x2, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, xzr, x11 __LF \ + mov x2, #-4294967295 __LF \ + sbc x12, x2, x12 __LF \ + mul x3, x1, x9 __LF \ + mul x4, x1, x10 __LF \ + mul x5, x1, x11 __LF \ + mul x6, x1, x12 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + umulh x11, x1, x11 __LF \ + umulh x7, x1, x12 __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, xzr __LF \ + mov x1, C __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x8, x7, #1 __LF \ + lsl x10, x8, #32 __LF \ + adds x6, x6, x10 __LF \ + adc x7, x7, xzr __LF \ + neg x9, x8 __LF \ + sub x10, x10, #1 __LF \ + subs x3, x3, x9 __LF \ + sbcs x4, x4, x10 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, x8 __LF \ + sbc x8, x7, x8 __LF \ + adds x3, x3, x8 __LF \ + and x9, x8, #4294967295 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + neg x10, x9 __LF \ + adc x6, x6, x10 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// P0 = 4 * P1 - P2, by direct subtraction of P2; the method +// in bignum_cmul_p256 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256, which is the case here. The +// actual accumulation of q * p_256 is done a bit differently +// so it works for the q = 0 case. 
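+// (Concretely: writing the value to be reduced as t = 2^256 * h + l with
+// 0 <= l < 2^256 and h the signed top word, subtracting q * p_256 for
+// q = h + 1 gives t - q * p_256 = (l - 2^256) + q * (2^256 - p_256). This
+// lies in [-p_256, p_256): the upper bound because 0 < 2^256 - p_256 < 2^224
+// and q is at most a few dozen for the combinations used here, the lower
+// bound automatically when q >= 1 and from t > -p_256 when q = 0. A single
+// addition of p_256 masked by the final borrow then fully reduces.)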
+ +#define cmsub41_p256(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + lsr x4, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x5, x4, #1 __LF \ + lsl x8, x5, #32 __LF \ + subs x6, xzr, x8 __LF \ + sbcs x7, xzr, xzr __LF \ + sbc x8, x8, x5 __LF \ + adds x0, x0, x5 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x8 __LF \ + csetm x5, cc __LF \ + adds x0, x0, x5 __LF \ + and x6, x5, #4294967295 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, xzr __LF \ + neg x7, x6 __LF \ + adc x3, x3, x7 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2, computed as (p_256 - P2) << 3 + 3 * P1 + +#define cmsub38_p256(P0,P1,P2) \ + mov x1, 8 __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x2, #4294967295 __LF \ + sbcs x10, x2, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, xzr, x11 __LF \ + mov x2, #-4294967295 __LF \ + sbc x12, x2, x12 __LF \ + lsl x3, x9, #3 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + extr x6, x12, x11, #61 __LF \ + lsr x7, x12, #61 __LF \ + mov x1, 3 __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x8, x7, #1 __LF \ + lsl x10, x8, #32 __LF \ + adds x6, x6, x10 __LF \ + adc x7, x7, xzr __LF \ + neg x9, x8 __LF \ + sub x10, x10, #1 __LF \ + subs x3, x3, x9 __LF \ + sbcs x4, x4, x10 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, x8 __LF \ + sbc x8, x7, x8 __LF \ + adds x3, x3, x8 __LF \ + and x9, x8, #4294967295 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + neg x10, x9 __LF \ + adc x6, x6, x10 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(p256_montjdouble): + +// Save registers and make room on stack for temporary variables + + sub sp, sp, NSPACE+32 + stp x30, xzr, [sp, NSPACE+16] + stp x19, x20, [sp, NSPACE] + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p256(z2,z_1) + montsqr_p256(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_p256(t2,x_1,z2) + weakadd_p256(t1,x_1,z2) + montmul_p256(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_p256(t1,y_1,z_1) + montmul_p256(xy2,x_1,y2) + montsqr_p256(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_p256(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p256(d_,12,xy2,9,x4p) + sub_p256(t1,t1,z2) + +// y4 = y^4 + + montsqr_p256(y4,y2) + +// dx2 = d * x2p + + montmul_p256(dx2,d_,x2p) + +// z_3' = 2 * y * z + + sub_p256(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_p256(x_3,xy2,d_) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p256(y_3,dx2,y4) + +// Restore registers and stack and return + + ldp x19, x20, [sp, NSPACE] + ldp x30, xzr, [sp, NSPACE+16] + add sp, sp, NSPACE+32 + ret 
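+
+// Note: the sequence above is the standard a = -3 Jacobian doubling. With
+// M = 3 * x2p = 3 * (x - z^2) * (x + z^2) and S = 4 * xy2 = 4 * x * y^2,
+// the quantity d = 12 * xy2 - 9 * x4p is S - x', and the stored outputs are
+// x' = M^2 - 2 * S, y' = M * (S - x') - 8 * y^4 and z' = 2 * y * z, the
+// last obtained as (y + z)^2 - y^2 - z^2.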
+ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/Makefile similarity index 96% rename from third_party/s2n-bignum/arm/p384/Makefile rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/Makefile index 5d64426750c..0606619cf85 100644 --- a/third_party/s2n-bignum/arm/p384/Makefile +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/Makefile @@ -37,10 +37,8 @@ OBJ = bignum_add_p384.o \ bignum_montinv_p384.o \ bignum_montmul_p384.o \ bignum_montmul_p384_alt.o \ - bignum_montmul_p384_neon.o \ bignum_montsqr_p384.o \ bignum_montsqr_p384_alt.o \ - bignum_montsqr_p384_neon.o \ bignum_mux_6.o \ bignum_neg_p384.o \ bignum_nonzero_6.o \ diff --git a/third_party/s2n-bignum/arm/p384/bignum_add_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_add_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_add_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_add_p384.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_bigendian_6.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_bigendian_6.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_bigendian_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_bigendian_6.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_cmul_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_cmul_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_cmul_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_cmul_p384.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_deamont_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_deamont_p384.S similarity index 80% rename from third_party/s2n-bignum/arm/p384/bignum_deamont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_deamont_p384.S index 1f84a4becf9..42d595a6bb1 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_deamont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_deamont_p384.S @@ -35,27 +35,27 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Recycle d0 (which we know gets implicitly cancelled) to store it */ \ - lsl t1, d0, #32; \ - add d0, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d0, t1, d0 __LF \ /* Now let [t2;t1] = 2^64 * w - w + w_hi where w_hi = floor(w/2^32) */ \ /* We need to subtract 2^32 * this, and we can ignore its lower 32 */ \ /* bits since by design it will cancel anyway; we only need the w_hi */ \ /* part to get the carry propagation going. 
*/ \ - lsr t1, d0, #32; \ - subs t1, t1, d0; \ - sbc t2, d0, xzr; \ + lsr t1, d0, #32 __LF \ + subs t1, t1, d0 __LF \ + sbc t2, d0, xzr __LF \ /* Now select in t1 the field to subtract from d1 */ \ - extr t1, t2, t1, #32; \ + extr t1, t2, t1, #32 __LF \ /* And now get the terms to subtract from d2 and d3 */ \ - lsr t2, t2, #32; \ - adds t2, t2, d0; \ - adc t3, xzr, xzr; \ + lsr t2, t2, #32 __LF \ + adds t2, t2, d0 __LF \ + adc t3, xzr, xzr __LF \ /* Do the subtraction of that portion */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ /* Now effectively add 2^384 * w by taking d0 as the input for last sbc */ \ sbc d6, d0, xzr diff --git a/third_party/s2n-bignum/arm/p384/bignum_demont_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_demont_p384.S similarity index 77% rename from third_party/s2n-bignum/arm/p384/bignum_demont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_demont_p384.S index 1b095172881..eca64f62dce 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_demont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_demont_p384.S @@ -35,27 +35,27 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Recycle d0 (which we know gets implicitly cancelled) to store it */ \ - lsl t1, d0, #32; \ - add d0, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d0, t1, d0 __LF \ /* Now let [t2;t1] = 2^64 * w - w + w_hi where w_hi = floor(w/2^32) */ \ /* We need to subtract 2^32 * this, and we can ignore its lower 32 */ \ /* bits since by design it will cancel anyway; we only need the w_hi */ \ /* part to get the carry propagation going. 
*/ \ - lsr t1, d0, #32; \ - subs t1, t1, d0; \ - sbc t2, d0, xzr; \ + lsr t1, d0, #32 __LF \ + subs t1, t1, d0 __LF \ + sbc t2, d0, xzr __LF \ /* Now select in t1 the field to subtract from d1 */ \ - extr t1, t2, t1, #32; \ + extr t1, t2, t1, #32 __LF \ /* And now get the terms to subtract from d2 and d3 */ \ - lsr t2, t2, #32; \ - adds t2, t2, d0; \ - adc t3, xzr, xzr; \ + lsr t2, t2, #32 __LF \ + adds t2, t2, d0 __LF \ + adc t3, xzr, xzr __LF \ /* Do the subtraction of that portion */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ /* Now effectively add 2^384 * w by taking d0 as the input for last sbc */ \ sbc d6, d0, xzr diff --git a/third_party/s2n-bignum/arm/p384/bignum_double_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_double_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_double_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_double_p384.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_half_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_half_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_half_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_half_p384.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_inv_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_inv_p384.S new file mode 100644 index 00000000000..111b220a304 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_inv_p384.S @@ -0,0 +1,1469 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +// +// extern void bignum_inv_p384(uint64_t z[static 6],uint64_t x[static 6]); +// +// If the 6-digit input x is coprime to p_384, i.e. is not divisible +// by it, returns z < p_384 such that x * z == 1 (mod p_384). Note that +// x does not need to be reduced modulo p_384, but the output always is. +// If the input is divisible (i.e. is 0 or p_384), then there can be no +// modular inverse and z = 0 is returned. 
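+// (Internally this runs repeated divstep batches, implemented by the
+// divstep59 macro below, with matching updates and Montgomery reductions
+// of the accumulators u and v.)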
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p384) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack +// The u and v variables are 6 words each as expected, but the f and g +// variables are 8 words each -- they need to have at least one extra +// word for a sign word, and to preserve alignment we "round up" to 8. +// In fact, we currently keep an extra word in u and v as well. + +#define f sp, #0 +#define g sp, #(8*N) +#define u sp, #(16*N) +#define v sp, #(24*N) + +// Total size to reserve on the stack + +#define NSPACE #(32*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d6;d5;d4;d3;d2;d1;d0] and returns result in [d6;d5d4;d3;d2;d1], adding +// to the existing [d6;d5;d4;d3;d2;d1], and re-using d0 as a temporary +// internally as well as t0, t1, t2. This is almost-Montgomery, i.e. the +// result fits in 6 digits but is not necessarily strictly reduced mod p_384. +// --------------------------------------------------------------------------- + +#define amontred(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ +/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_384. */ \ + mov t1, #0xe000000000000000 __LF \ + adds d0, d0, t1 __LF \ + mov t2, #0x000000001fffffff __LF \ + adcs d1, d1, t2 __LF \ + mov t3, #0xffffffffe0000000 __LF \ + bic t3, t3, #0x2000000000000000 __LF \ + adcs d2, d2, t3 __LF \ + sbcs d3, d3, xzr __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ + mov t1, #0x1fffffffffffffff __LF \ + adc d6, d6, t1 __LF \ +/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ +/* Store it back into d0 since we no longer need that digit. 
*/ \ + add d0, d0, d0, lsl #32 __LF \ +/* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ +/* We know the lowest word will cancel d0 so we don't need it */ \ + mov t1, #0xffffffff00000001 __LF \ + umulh t1, t1, d0 __LF \ + mov t2, #0x00000000ffffffff __LF \ + mul t3, t2, d0 __LF \ + umulh t2, t2, d0 __LF \ + adds t1, t1, t3 __LF \ + adcs t2, t2, d0 __LF \ + cset t3, cs __LF \ +/* Now x + p_384 * w = (x + 2^384 * w) - (2^384 - p_384) * w */ \ +/* We catch the net top carry from add-subtract in the digit d0 */ \ + adds d6, d6, d0 __LF \ + cset d0, cs __LF \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ + sbcs d6, d6, xzr __LF \ + sbcs d0, d0, xzr __LF \ +/* Now if d0 is nonzero we subtract p_384 (almost-Montgomery) */ \ + neg d0, d0 __LF \ + and t1, d0, #0x00000000ffffffff __LF \ + and t2, d0, #0xffffffff00000000 __LF \ + and t3, d0, #0xfffffffffffffffe __LF \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, d0 __LF \ + sbcs d5, d5, d0 __LF \ + sbc d6, d6, d0 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add 
x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ 
+ csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ 
+ csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne 
__LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_inv_p384): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! 
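+// (NSPACE = 32 * N = 256 bytes: an 8-word slot for each of f, g, u and v)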
+ sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0x00000000ffffffff + mov x11, #0xffffffff00000000 + mov x12, #0xfffffffffffffffe + mov x15, #0xffffffffffffffff + stp x10, x11, [f] + stp x12, x15, [f+2*N] + stp x15, x15, [f+4*N] + str xzr, [f+6*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, x12 + sbcs x13, x5, x15 + ldp x6, x7, [x1, #(4*N)] + sbcs x14, x6, x15 + sbcs x15, x7, x15 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + csel x6, x6, x14, cc + csel x7, x7, x15, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + stp x6, x7, [g+4*N] + str xzr, [g+6*N] + +// Also maintain reduced < 2^384 vector [u,v] such that +// [f,g] == x * 2^{5*i-75} * [u,v] (mod p_384) +// starting with [p_384,x] == x * 2^{5*0-75} * [0,2^75] (mod p_384) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + stp xzr, xzr, [u+4*N] + + mov x10, #2048 + stp xzr, x10, [v] + stp xzr, xzr, [v+2*N] + stp xzr, xzr, [v+4*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special fifteenth iteration after a uniform +// first 14. + + mov i, #15 + mov d, #1 + b bignum_inv_p384_midloop + +bignum_inv_p384_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
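+// (For each word w of f or g, the signed product with a matrix entry of
+// magnitude m and sign mask s (0 or all 1s) is formed as (w EOR s) * m,
+// with a one-off +(m AND s) folded into car0/car1 at the lowest digit,
+// using the identity -(w * m) = (NOT w) * m + m carried across the words.)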
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digit 3 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + +// Digit 4 of [f,g] + + ldr x7, [f+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [g+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [g+3*N] + +// Digits 5 and 6 of [f,g] + + ldr x7, [f+5*N] + eor x1, x7, s00 + ldr x23, [f+6*N] + eor x2, x23, s00 + and x2, x2, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [g+5*N] + eor x1, x8, s01 + ldr x24, [g+6*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [f+4*N] + extr x4, x2, x4, #59 + str x4, [f+5*N] + asr x2, x2, #59 + str x2, [f+6*N] + + eor x1, x7, s10 + eor x4, x23, s10 + and x4, x4, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x4, x4, x1 + extr x6, x5, x6, #59 + str x6, [g+4*N] + extr x5, x4, x5, #59 + str x5, [g+5*N] + asr x4, x4, #59 + str x4, [g+6*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. 
A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digit 3 of [u,v] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + str x2, [v+3*N] + adc x6, x6, x1 + +// Digit 4 of [u,v] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + str x6, [v+4*N] + adc x5, x5, x1 + +// Digits 5 and 6 of [u,v] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + + eor x1, x7, s10 + and x4, s10, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v+5*N] + adc x4, x4, x1 + str x4, [v+6*N] + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldp x2, x3, [u+16] + ldp x4, x5, [u+32] + ldr x6, [u+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, x2, [u] + stp x3, x4, [u+16] + stp x5, x6, [u+32] + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldp x2, x3, [v+16] + ldp x4, x5, [v+32] + ldr x6, [v+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, x2, [v] + stp x3, x4, [v+16] + stp x5, x6, [v+32] + +bignum_inv_p384_midloop: + + mov x1, d + ldr x2, [f] + 
ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_inv_p384_loop + +// The 15th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_384) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digit 3 of [u] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + +// Digit 4 of [u] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + +// Digits 5 and 6 of [u] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + +// Montgomery reduction of u. 
This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_384 + + ldp x10, x0, [u] + ldp x1, x2, [u+16] + ldp x3, x4, [u+32] + ldr x5, [u+48] + amontred(x5,x4,x3,x2,x1,x0,x10, x9,x8,x7) + + mov x10, #0x00000000ffffffff + subs x10, x0, x10 + mov x11, #0xffffffff00000000 + sbcs x11, x1, x11 + mov x12, #0xfffffffffffffffe + sbcs x12, x2, x12 + mov x15, #0xffffffffffffffff + sbcs x13, x3, x15 + sbcs x14, x4, x15 + sbcs x15, x5, x15 + + csel x0, x0, x10, cc + csel x1, x1, x11, cc + csel x2, x2, x12, cc + csel x3, x3, x13, cc + csel x4, x4, x14, cc + csel x5, x5, x15, cc + +// Store it back to the final output + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x4, x5, [res, #32] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/bignum_littleendian_6.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_littleendian_6.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_littleendian_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_littleendian_6.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_mod_n384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384.S similarity index 87% rename from third_party/s2n-bignum/arm/p384/bignum_mod_n384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384.S index a91bb2c5b5a..9aaa029e232 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_mod_n384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384.S @@ -59,9 +59,9 @@ // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(bignum_mod_n384): @@ -71,7 +71,7 @@ S2N_BN_SYMBOL(bignum_mod_n384_alt): // If the input is already <= 5 words long, go to a trivial "copy" path cmp k, #6 - bcc short + bcc bignum_mod_n384_short // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -105,8 +105,8 @@ S2N_BN_SYMBOL(bignum_mod_n384_alt): // Now do (k-6) iterations of 7->6 word modular reduction - cbz k, writeback -loop: + cbz k, bignum_mod_n384_writeback +bignum_mod_n384_loop: // Compute q = min (m5 + 1) (2^64 - 1) @@ -161,11 +161,11 @@ loop: sbc m5, m4, xzr mov m4, t - cbnz k, loop + cbnz k, bignum_mod_n384_loop // Finally write back [m5;m4;m3;m2;m1;m0] and return -writeback: +bignum_mod_n384_writeback: stp m0, m1, [z] stp m2, m3, [z, #16] stp m4, m5, [z, #32] @@ -174,7 +174,7 @@ writeback: // Short case: just copy the input with zero-padding -short: +bignum_mod_n384_short: mov m0, xzr mov m1, xzr mov m2, xzr @@ -182,21 +182,21 @@ short: mov m4, xzr mov m5, xzr - cbz k, writeback + cbz k, bignum_mod_n384_writeback ldr m0, [x] subs k, k, #1 - beq writeback + beq bignum_mod_n384_writeback ldr m1, [x, #8] subs k, k, #1 - beq writeback + beq bignum_mod_n384_writeback ldr m2, [x, #16] subs k, k, #1 - beq writeback + beq bignum_mod_n384_writeback ldr m3, [x, #24] subs k, k, #1 - beq writeback + beq bignum_mod_n384_writeback ldr m4, [x, #32] - b writeback + b bignum_mod_n384_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/arm/p384/bignum_mod_n384_6.S 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384_6.S similarity index 91% rename from third_party/s2n-bignum/arm/p384/bignum_mod_n384_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384_6.S index e79ad3fe853..ad9e4b9700e 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_mod_n384_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_n384_6.S @@ -37,9 +37,9 @@ #define d5 x13 #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(bignum_mod_n384_6): diff --git a/third_party/s2n-bignum/arm/p384/bignum_mod_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_p384.S similarity index 90% rename from third_party/s2n-bignum/arm/p384/bignum_mod_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_p384.S index cf7f1d6bbbf..a92548684d3 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_mod_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_p384.S @@ -49,7 +49,7 @@ S2N_BN_SYMBOL(bignum_mod_p384_alt): // If the input is already <= 5 words long, go to a trivial "copy" path cmp k, #6 - bcc short + bcc bignum_mod_p384_short // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -83,8 +83,8 @@ S2N_BN_SYMBOL(bignum_mod_p384_alt): // Now do (k-6) iterations of 7->6 word modular reduction - cbz k, writeback -loop: + cbz k, bignum_mod_p384_writeback +bignum_mod_p384_loop: // Decrement k and load the next digit as t5. We now want to reduce // [m5;m4;m3;m2;m1;m0;t5] |-> [m5;m4;m3;m2;m1;m0]; the shuffling downwards is @@ -134,11 +134,11 @@ loop: adcs m4, t4, n1 adc m5, t5, n1 - cbnz k, loop + cbnz k, bignum_mod_p384_loop // Finally write back [m5;m4;m3;m2;m1;m0] and return -writeback: +bignum_mod_p384_writeback: stp m0, m1, [z] stp m2, m3, [z, #16] stp m4, m5, [z, #32] @@ -147,7 +147,7 @@ writeback: // Short case: just copy the input with zero-padding -short: +bignum_mod_p384_short: mov m0, xzr mov m1, xzr mov m2, xzr @@ -155,21 +155,21 @@ short: mov m4, xzr mov m5, xzr - cbz k, writeback + cbz k, bignum_mod_p384_writeback ldr m0, [x] subs k, k, #1 - beq writeback + beq bignum_mod_p384_writeback ldr m1, [x, #8] subs k, k, #1 - beq writeback + beq bignum_mod_p384_writeback ldr m2, [x, #16] subs k, k, #1 - beq writeback + beq bignum_mod_p384_writeback ldr m3, [x, #24] subs k, k, #1 - beq writeback + beq bignum_mod_p384_writeback ldr m4, [x, #32] - b writeback + b bignum_mod_p384_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/arm/p384/bignum_mod_p384_6.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_p384_6.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_mod_p384_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mod_p384_6.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montinv_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montinv_p384.S new file mode 100644 index 00000000000..fd572e9677b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montinv_p384.S @@ -0,0 +1,1487 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +// +// extern void bignum_montinv_p384(uint64_t z[static 6],uint64_t x[static 6]); +// +// If the 6-digit input x is coprime to p_384, i.e. is not divisible +// by it, returns z < p_384 such that x * z == 2^768 (mod p_384). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^384 * X and z == 2^384 * Z +// (both mod p_384) then X * Z == 1 (mod p_384). That is, this function +// gives the analog of the modular inverse bignum_inv_p384 but with both +// input and output in the Montgomery domain. Note that x does not need +// to be reduced modulo p_384, but the output always is. If the input +// is divisible (i.e. is 0 or p_384), then there can be no solution to +// the congruence x * z == 2^768 (mod p_384), and z = 0 is returned. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p384) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack +// The u and v variables are 6 words each as expected, but the f and g +// variables are 8 words each -- they need to have at least one extra +// word for a sign word, and to preserve alignment we "round up" to 8. +// In fact, we currently keep an extra word in u and v as well. + +#define f sp, #0 +#define g sp, #(8*N) +#define u sp, #(16*N) +#define v sp, #(24*N) + +// Total size to reserve on the stack + +#define NSPACE #(32*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d6;d5;d4;d3;d2;d1;d0] and returns result in [d6;d5;d4;d3;d2;d1], adding +// to the existing [d6;d5;d4;d3;d2;d1], and re-using d0 as a temporary +// internally as well as t1, t2, t3. This is almost-Montgomery, i.e. the +// result fits in 6 digits but is not necessarily strictly reduced mod p_384. +// --------------------------------------------------------------------------- + +#define amontred(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ +/* We only know the input is -2^444 < x < 2^444. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_384.
*/ \ + mov t1, #0xe000000000000000 __LF \ + adds d0, d0, t1 __LF \ + mov t2, #0x000000001fffffff __LF \ + adcs d1, d1, t2 __LF \ + mov t3, #0xffffffffe0000000 __LF \ + bic t3, t3, #0x2000000000000000 __LF \ + adcs d2, d2, t3 __LF \ + sbcs d3, d3, xzr __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ + mov t1, #0x1fffffffffffffff __LF \ + adc d6, d6, t1 __LF \ +/* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ +/* Store it back into d0 since we no longer need that digit. */ \ + add d0, d0, d0, lsl #32 __LF \ +/* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ +/* We know the lowest word will cancel d0 so we don't need it */ \ + mov t1, #0xffffffff00000001 __LF \ + umulh t1, t1, d0 __LF \ + mov t2, #0x00000000ffffffff __LF \ + mul t3, t2, d0 __LF \ + umulh t2, t2, d0 __LF \ + adds t1, t1, t3 __LF \ + adcs t2, t2, d0 __LF \ + cset t3, cs __LF \ +/* Now x + p_384 * w = (x + 2^384 * w) - (2^384 - p_384) * w */ \ +/* We catch the net top carry from add-subtract in the digit d0 */ \ + adds d6, d6, d0 __LF \ + cset d0, cs __LF \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ + sbcs d6, d6, xzr __LF \ + sbcs d0, d0, xzr __LF \ +/* Now if d0 is nonzero we subtract p_384 (almost-Montgomery) */ \ + neg d0, d0 __LF \ + and t1, d0, #0x00000000ffffffff __LF \ + and t2, d0, #0xffffffff00000000 __LF \ + and t3, d0, #0xfffffffffffffffe __LF \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, d0 __LF \ + sbcs d5, d5, d0 __LF \ + sbc d6, d6, d0 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add 
x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + 
add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ 
+ asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + 
ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, 
#43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_montinv_p384): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0x00000000ffffffff + mov x11, #0xffffffff00000000 + mov x12, #0xfffffffffffffffe + mov x15, #0xffffffffffffffff + stp x10, x11, [f] + stp x12, x15, [f+2*N] + stp x15, x15, [f+4*N] + str xzr, [f+6*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, x12 + sbcs x13, x5, x15 + ldp x6, x7, [x1, #(4*N)] + sbcs x14, x6, x15 + sbcs x15, x7, x15 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + csel x6, x6, x14, cc + csel x7, x7, x15, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + stp x6, x7, [g+4*N] + str xzr, [g+6*N] + +// Also maintain reduced < 2^384 vector [u,v] such that +// [f,g] == x * 2^{5*i-843} * [u,v] (mod p_384) +// starting with [p_384,x] == x * 2^{5*0-843} * [0,2^843] (mod p_384) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 15th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{75-843} * u == 1, i.e. +// x * u == 2^768 as required. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + stp xzr, xzr, [u+4*N] + +// The starting constant 2^843 mod p_384 is +// 0x0000000000000800:00001000000007ff:fffff00000000000 +// :00001000000007ff:fffff00000000800:0000000000000000 +// where colons separate 64-bit subwords, least significant at the right. +// Not all of these are single loads on ARM so this is a bit dynamic + + mov x12, #0xfffff00000000000 + orr x10, x12, #0x0000000000000800 + stp xzr, x10, [v] + mov x11, #0x00000000000007ff + orr x11, x11, #0x0000100000000000 + stp x11, x12, [v+2*N] + mov x12, #0x0000000000000800 + stp x11, x12, [v+4*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special fifteenth iteration after a uniform +// first 14. + + mov i, #15 + mov d, #1 + b bignum_montinv_p384_midloop + +bignum_montinv_p384_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. 
This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. +// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digit 3 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + +// Digit 4 of [f,g] + + ldr x7, [f+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [g+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [g+3*N] + +// Digits 5 and 6 of [f,g] + + ldr x7, [f+5*N] + eor x1, x7, s00 + ldr x23, [f+6*N] + eor x2, x23, s00 + and x2, x2, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [g+5*N] + eor x1, x8, s01 + ldr x24, [g+6*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [f+4*N] + extr x4, x2, x4, #59 + str x4, [f+5*N] + asr x2, x2, #59 + str x2, [f+6*N] + + eor x1, x7, s10 + eor x4, x23, s10 + and x4, x4, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x4, x4, x1 + extr x6, x5, x6, #59 + str x6, [g+4*N] + extr x5, x4, x5, #59 + str x5, [g+5*N] + asr x4, x4, #59 + str 
x4, [g+6*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digit 3 of [u,v] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + str x2, [v+3*N] + adc x6, x6, x1 + +// Digit 4 of [u,v] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + str x6, [v+4*N] + adc x5, x5, x1 + +// Digits 5 and 6 of [u,v] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + + eor x1, x7, s10 + and x4, s10, m10 + neg x4, x4 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x4, x4, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x4, x4, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v+5*N] + adc x4, x4, x1 + str x4, [v+6*N] + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldp x2, x3, [u+16] + ldp x4, x5, [u+32] + ldr x6, [u+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, x2, [u] + stp x3, x4, [u+16] + stp x5, x6, [u+32] + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldp x2, x3, [v+16] + ldp x4, x5, [v+32] + ldr x6, [v+48] + amontred(x6,x5,x4,x3,x2,x1,x0, x9,x8,x7) + stp x1, 
x2, [v] + stp x3, x4, [v+16] + stp x5, x6, [v+32] + +bignum_montinv_p384_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_montinv_p384_loop + +// The 15th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-768} [u,v] (mod p_384) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digit 3 of [u] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + +// Digit 4 of [u] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + +// Digits 5 and 6 of [u] (top is unsigned) + + ldr x7, [u+5*N] + eor x1, x7, s00 + and x2, s00, m00 + neg x2, x2 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, x2, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x2, x2, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + str x2, [u+6*N] + +// Montgomery reduction of u. 
This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_384 + + ldp x10, x0, [u] + ldp x1, x2, [u+16] + ldp x3, x4, [u+32] + ldr x5, [u+48] + amontred(x5,x4,x3,x2,x1,x0,x10, x9,x8,x7) + + mov x10, #0x00000000ffffffff + subs x10, x0, x10 + mov x11, #0xffffffff00000000 + sbcs x11, x1, x11 + mov x12, #0xfffffffffffffffe + sbcs x12, x2, x12 + mov x15, #0xffffffffffffffff + sbcs x13, x3, x15 + sbcs x14, x4, x15 + sbcs x15, x5, x15 + + csel x0, x0, x10, cc + csel x1, x1, x11, cc + csel x2, x2, x12, cc + csel x3, x3, x13, cc + csel x4, x4, x14, cc + csel x5, x5, x15, cc + +// Store it back to the final output + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x4, x5, [res, #32] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384.S similarity index 97% rename from third_party/s2n-bignum/arm/p384/bignum_montmul_p384_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384.S index 08c296bc0d2..60a960c5223 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384.S @@ -5,7 +5,7 @@ // Montgomery multiply, z := (x * y / 2^384) mod p_384 // Inputs x[6], y[6]; output z[6] // -// extern void bignum_montmul_p384_neon +// extern void bignum_montmul_p384 // (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); // // Does z := (2^{-384} * x * y) mod p_384, assuming that the inputs x and y @@ -15,7 +15,8 @@ // Standard ARM ABI: X0 = z, X1 = x, X2 = y // ---------------------------------------------------------------------------- -// bignum_montmul_p384_neon is functionally equivalent to bignum_montmul_p384. +// bignum_montmul_p384 is functionally equivalent to +// unopt/bignum_montmul_p384_base. // It is written in a way that // 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully // chosen and vectorized @@ -24,9 +25,9 @@ // // The output program of step 1. is as follows: // -// stp x19, x20, [sp, #-16]! -// stp x21, x22, [sp, #-16]! -// stp x23, x24, [sp, #-16]! +// stp x19, x20, [sp, #-16]! +// stp x21, x22, [sp, #-16]! +// stp x23, x24, [sp, #-16]! 
// ldp x3, x21, [x1] // ldr q30, [x1] // ldp x8, x24, [x1, #16] @@ -433,9 +434,9 @@ // stp x10, x5, [x0] // @slothy:writes=buffer0 // stp x24, x8, [x0, #16] // @slothy:writes=buffer16 // stp x21, x2, [x0, #32] // @slothy:writes=buffer32 -// ldp x23, x24, [sp], #16 -// ldp x21, x22, [sp], #16 -// ldp x19, x20, [sp], #16 +// ldp x23, x24, [sp], #16 +// ldp x21, x22, [sp], #16 +// ldp x19, x20, [sp], #16 // ret // // The bash script used for step 2 is as follows: @@ -452,12 +453,12 @@ #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384) .text .balign 4 -S2N_BN_SYMBOL(bignum_montmul_p384_neon): +S2N_BN_SYMBOL(bignum_montmul_p384): // Save some registers diff --git a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384_alt.S similarity index 88% rename from third_party/s2n-bignum/arm/p384/bignum_montmul_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384_alt.S index a6464f07cc7..c44ca21f249 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montmul_p384_alt.S @@ -34,24 +34,24 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Store it in d6 to make the 2^384 * w contribution already */ \ - lsl t1, d0, #32; \ - add d6, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d6, t1, d0 __LF \ /* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ /* We know the lowest word will cancel d0 so we don't need it */ \ - mov t1, #0xffffffff00000001; \ - umulh t1, t1, d6; \ - mov t2, #0x00000000ffffffff; \ - mul t3, t2, d6; \ - umulh t2, t2, d6; \ - adds t1, t1, t3; \ - adcs t2, t2, d6; \ - adc t3, xzr, xzr; \ + mov t1, #0xffffffff00000001 __LF \ + umulh t1, t1, d6 __LF \ + mov t2, #0x00000000ffffffff __LF \ + mul t3, t2, d6 __LF \ + umulh t2, t2, d6 __LF \ + adds t1, t1, t3 __LF \ + adcs t2, t2, d6 __LF \ + adc t3, xzr, xzr __LF \ /* Now add it, by subtracting from 2^384 * w + x */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ sbc d6, d6, xzr diff --git a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384.S similarity index 98% rename from third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384.S index 9be6380eb44..8468628b1eb 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384.S @@ -5,7 +5,7 @@ // Montgomery square, z := (x^2 / 2^384) mod p_384 // Input x[6]; output z[6] // -// extern void bignum_montsqr_p384_neon +// extern void bignum_montsqr_p384 // (uint64_t z[static 6], uint64_t x[static 6]); // // Does z := (x^2 / 2^384) mod p_384, assuming x^2 <= 2^384 * p_384, which is @@ -14,7 +14,8 @@ // Standard ARM ABI: X0 = z, X1 = x // ---------------------------------------------------------------------------- -// bignum_montsqr_p384_neon is functionally equivalent to bignum_montsqr_p384. 
+// bignum_montsqr_p384 is functionally equivalent to +// unopt/bignum_montsqr_p384_base. // It is written in a way that // 1. A subset of scalar multiplications in bignum_montsqr_p384 are carefully // chosen and vectorized @@ -344,12 +345,12 @@ #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384) .text .balign 4 -S2N_BN_SYMBOL(bignum_montsqr_p384_neon): +S2N_BN_SYMBOL(bignum_montsqr_p384): ldr q1, [x1] ldp x9, x2, [x1] diff --git a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384_alt.S similarity index 86% rename from third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384_alt.S index f49830d21ed..609a4bb4bf9 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_montsqr_p384_alt.S @@ -33,24 +33,24 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Store it in d6 to make the 2^384 * w contribution already */ \ - lsl t1, d0, #32; \ - add d6, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d6, t1, d0 __LF \ /* Now let [t3;t2;t1;-] = (2^384 - p_384) * w */ \ /* We know the lowest word will cancel d0 so we don't need it */ \ - mov t1, #0xffffffff00000001; \ - umulh t1, t1, d6; \ - mov t2, #0x00000000ffffffff; \ - mul t3, t2, d6; \ - umulh t2, t2, d6; \ - adds t1, t1, t3; \ - adcs t2, t2, d6; \ - adc t3, xzr, xzr; \ + mov t1, #0xffffffff00000001 __LF \ + umulh t1, t1, d6 __LF \ + mov t2, #0x00000000ffffffff __LF \ + mul t3, t2, d6 __LF \ + umulh t2, t2, d6 __LF \ + adds t1, t1, t3 __LF \ + adcs t2, t2, d6 __LF \ + adc t3, xzr, xzr __LF \ /* Now add it, by subtracting from 2^384 * w + x */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ sbc d6, d6, xzr #define z x0 diff --git a/third_party/s2n-bignum/arm/p384/bignum_mux_6.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mux_6.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_mux_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_mux_6.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_neg_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_neg_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_neg_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_neg_p384.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_nonzero_6.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_nonzero_6.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_nonzero_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_nonzero_6.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_optneg_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_optneg_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_optneg_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_optneg_p384.S diff --git 
a/third_party/s2n-bignum/arm/p384/bignum_sub_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_sub_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_sub_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_sub_p384.S diff --git a/third_party/s2n-bignum/arm/p384/bignum_tomont_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_tomont_p384.S similarity index 60% rename from third_party/s2n-bignum/arm/p384/bignum_tomont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_tomont_p384.S index c666f5e78fc..c371505bc77 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_tomont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_tomont_p384.S @@ -27,38 +27,38 @@ #define modstep_p384(d6,d5,d4,d3,d2,d1,d0, t1,t2,t3) \ /* Initial quotient approximation q = min (h + 1) (2^64 - 1) */ \ - adds d6, d6, #1; \ - csetm t3, cs; \ - add d6, d6, t3; \ - orn t3, xzr, t3; \ - sub t2, d6, #1; \ - sub t1, xzr, d6; \ + adds d6, d6, #1 __LF \ + csetm t3, cs __LF \ + add d6, d6, t3 __LF \ + orn t3, xzr, t3 __LF \ + sub t2, d6, #1 __LF \ + sub t1, xzr, d6 __LF \ /* Correction term [d6;t2;t1;d0] = q * (2^384 - p_384) */ \ - lsl d0, t1, #32; \ - extr t1, t2, t1, #32; \ - lsr t2, t2, #32; \ - adds d0, d0, d6; \ - adcs t1, t1, xzr; \ - adcs t2, t2, d6; \ - adc d6, xzr, xzr; \ + lsl d0, t1, #32 __LF \ + extr t1, t2, t1, #32 __LF \ + lsr t2, t2, #32 __LF \ + adds d0, d0, d6 __LF \ + adcs t1, t1, xzr __LF \ + adcs t2, t2, d6 __LF \ + adc d6, xzr, xzr __LF \ /* Addition to the initial value */ \ - adds d1, d1, t1; \ - adcs d2, d2, t2; \ - adcs d3, d3, d6; \ - adcs d4, d4, xzr; \ - adcs d5, d5, xzr; \ - adc t3, t3, xzr; \ + adds d1, d1, t1 __LF \ + adcs d2, d2, t2 __LF \ + adcs d3, d3, d6 __LF \ + adcs d4, d4, xzr __LF \ + adcs d5, d5, xzr __LF \ + adc t3, t3, xzr __LF \ /* Use net top of the 7-word answer in t3 for masked correction */ \ - mov t1, #0x00000000ffffffff; \ - and t1, t1, t3; \ - adds d0, d0, t1; \ - eor t1, t1, t3; \ - adcs d1, d1, t1; \ - mov t1, #0xfffffffffffffffe; \ - and t1, t1, t3; \ - adcs d2, d2, t1; \ - adcs d3, d3, t3; \ - adcs d4, d4, t3; \ + mov t1, #0x00000000ffffffff __LF \ + and t1, t1, t3 __LF \ + adds d0, d0, t1 __LF \ + eor t1, t1, t3 __LF \ + adcs d1, d1, t1 __LF \ + mov t1, #0xfffffffffffffffe __LF \ + and t1, t1, t3 __LF \ + adcs d2, d2, t1 __LF \ + adcs d3, d3, t3 __LF \ + adcs d4, d4, t3 __LF \ adc d5, d5, t3 S2N_BN_SYMBOL(bignum_tomont_p384): diff --git a/third_party/s2n-bignum/arm/p384/bignum_triple_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_triple_p384.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/bignum_triple_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/bignum_triple_p384.S diff --git a/third_party/s2n-bignum/arm/p384/p384_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjadd.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/p384_montjadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjadd.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjadd_alt.S new file mode 100644 index 00000000000..e5ccc45833f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjadd_alt.S @@ -0,0 +1,993 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x12, x3, x5 __LF \ + umulh x13, x3, x5 __LF \ + mul x11, x3, x6 __LF \ + umulh x14, x3, x6 __LF \ + adds x13, x13, x11 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + umulh x15, x3, x7 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x8 __LF \ + umulh x16, x3, x8 __LF \ + adcs x15, x15, x11 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x11, x3, x9 __LF \ + umulh x17, x3, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x19, x3, x10 __LF \ + adcs x17, x17, x11 __LF \ + adc x19, x19, xzr __LF \ + mul x11, x4, x5 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x19, x19, x11 __LF \ + cset x20, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x20, x20, x11 __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x11, x3, x5 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x3, 
x7 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x20, x20, x11 __LF \ + cset x21, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x21, x21, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x21, x21, x11 __LF \ + cset x22, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x22, x22, x11 __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x11, x3, x5 __LF \ + adds x16, x16, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x7 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x22, x22, x11 __LF \ + cset x2, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x17, x17, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x2, x2, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x17, x17, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x22, x22, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x2, x2, x11 __LF \ + cset x1, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x19, x19, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x2, x2, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x1, x1, x11 __LF \ + lsl x7, x12, #32 __LF \ + add x12, x7, x12 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x12 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x12 __LF \ + umulh x6, x6, x12 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x5, xzr, xzr __LF \ + subs x13, x13, x7 __LF \ + sbcs x14, x14, x6 __LF \ + sbcs x15, x15, x5 __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x7, x13, #32 __LF \ + add x13, x7, x13 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x13 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x13 __LF \ + umulh x6, x6, x13 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x13 __LF \ + adc x5, xzr, xzr __LF \ + subs x14, x14, x7 __LF \ + sbcs x15, x15, x6 __LF \ + sbcs x16, x16, x5 __LF \ + sbcs x17, x17, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + lsl x7, x14, #32 __LF \ + add x14, x7, x14 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh 
x7, x7, x14 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x14 __LF \ + umulh x6, x6, x14 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x14 __LF \ + adc x5, xzr, xzr __LF \ + subs x15, x15, x7 __LF \ + sbcs x16, x16, x6 __LF \ + sbcs x17, x17, x5 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x14, x14, xzr __LF \ + lsl x7, x15, #32 __LF \ + add x15, x7, x15 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x15 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x15 __LF \ + umulh x6, x6, x15 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x15 __LF \ + adc x5, xzr, xzr __LF \ + subs x16, x16, x7 __LF \ + sbcs x17, x17, x6 __LF \ + sbcs x12, x12, x5 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + lsl x7, x16, #32 __LF \ + add x16, x7, x16 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x16 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x16 __LF \ + umulh x6, x6, x16 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x16 __LF \ + adc x5, xzr, xzr __LF \ + subs x17, x17, x7 __LF \ + sbcs x12, x12, x6 __LF \ + sbcs x13, x13, x5 __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbc x16, x16, xzr __LF \ + lsl x7, x17, #32 __LF \ + add x17, x7, x17 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x17 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x17 __LF \ + umulh x6, x6, x17 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x17 __LF \ + adc x5, xzr, xzr __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, x6 __LF \ + sbcs x14, x14, x5 __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbc x17, x17, xzr __LF \ + adds x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adcs x14, x14, x21 __LF \ + adcs x15, x15, x22 __LF \ + adcs x16, x16, x2 __LF \ + adcs x17, x17, x1 __LF \ + adc x10, xzr, xzr __LF \ + mov x11, #0xffffffff00000001 __LF \ + adds x19, x12, x11 __LF \ + mov x11, #0xffffffff __LF \ + adcs x20, x13, x11 __LF \ + mov x11, #0x1 __LF \ + adcs x21, x14, x11 __LF \ + adcs x22, x15, xzr __LF \ + adcs x2, x16, xzr __LF \ + adcs x1, x17, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + csel x14, x14, x21, eq __LF \ + csel x15, x15, x22, eq __LF \ + csel x16, x16, x2, eq __LF \ + csel x17, x17, x1, eq __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] __LF \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x8, x2, x4 __LF \ + adds x10, x10, x8 __LF \ + mul x11, x2, x5 __LF \ + mul x8, x3, x4 __LF \ + adcs x11, x11, x8 __LF \ + umulh x12, x2, x5 __LF \ + mul x8, x3, x5 __LF \ + adcs x12, x12, x8 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x13, x2, x7 __LF \ + mul x8, x3, x6 __LF \ + adcs x13, x13, x8 __LF \ + umulh x14, x2, x7 __LF \ + mul x8, x3, x7 __LF \ + adcs x14, x14, x8 __LF \ + mul x15, x5, x6 __LF \ + adcs x15, x15, xzr __LF \ + umulh x16, x5, x6 __LF \ + adc x16, x16, xzr __LF \ + umulh x8, x2, x4 __LF \ + adds x11, x11, x8 __LF \ + umulh x8, x3, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x3, x5 __LF \ + adcs x13, x13, x8 __LF \ + umulh x8, x3, x6 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x3, x7 __LF \ + adcs x15, x15, x8 __LF \ + adc x16, x16, xzr __LF \ + mul x8, x2, x6 __LF \ + adds x12, x12, x8 __LF \ + mul x8, x4, x5 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x4, x6 __LF \ + adcs x14, x14, x8 __LF \ + mul x8, x4, x7 
__LF \ + adcs x15, x15, x8 __LF \ + mul x8, x5, x7 __LF \ + adcs x16, x16, x8 __LF \ + mul x17, x6, x7 __LF \ + adcs x17, x17, xzr __LF \ + umulh x19, x6, x7 __LF \ + adc x19, x19, xzr __LF \ + umulh x8, x2, x6 __LF \ + adds x13, x13, x8 __LF \ + umulh x8, x4, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x4, x6 __LF \ + adcs x15, x15, x8 __LF \ + umulh x8, x4, x7 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x5, x7 __LF \ + adcs x17, x17, x8 __LF \ + adc x19, x19, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + cset x20, hs __LF \ + umulh x8, x2, x2 __LF \ + mul x2, x2, x2 __LF \ + adds x9, x9, x8 __LF \ + mul x8, x3, x3 __LF \ + adcs x10, x10, x8 __LF \ + umulh x8, x3, x3 __LF \ + adcs x11, x11, x8 __LF \ + mul x8, x4, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x4, x4 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x5, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x5, x5 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x6, x6 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x6, x6 __LF \ + adcs x17, x17, x8 __LF \ + mul x8, x7, x7 __LF \ + adcs x19, x19, x8 __LF \ + umulh x8, x7, x7 __LF \ + adc x20, x20, x8 __LF \ + lsl x5, x2, #32 __LF \ + add x2, x5, x2 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x2 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x2 __LF \ + umulh x4, x4, x2 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x2 __LF \ + adc x3, xzr, xzr __LF \ + subs x9, x9, x5 __LF \ + sbcs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x2, x2, xzr __LF \ + lsl x5, x9, #32 __LF \ + add x9, x5, x9 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x9 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x9 __LF \ + umulh x4, x4, x9 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x9 __LF \ + adc x3, xzr, xzr __LF \ + subs x10, x10, x5 __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x2, x2, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x5, x10, #32 __LF \ + add x10, x5, x10 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x10 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x10 __LF \ + umulh x4, x4, x10 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x10 __LF \ + adc x3, xzr, xzr __LF \ + subs x11, x11, x5 __LF \ + sbcs x12, x12, x4 __LF \ + sbcs x13, x13, x3 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + lsl x5, x11, #32 __LF \ + add x11, x5, x11 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x11 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x11 __LF \ + umulh x4, x4, x11 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x11 __LF \ + adc x3, xzr, xzr __LF \ + subs x12, x12, x5 __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x2, x2, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + lsl x5, x12, #32 __LF \ + add x12, x5, x12 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x12 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x12 __LF \ + umulh x4, x4, x12 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x12 __LF \ + adc x3, xzr, xzr __LF \ + subs x13, x13, x5 __LF \ + sbcs x2, x2, x4 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x5, x13, #32 __LF \ + add x13, x5, x13 __LF \ + 
mov x5, #-4294967295 __LF \ + umulh x5, x5, x13 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x13 __LF \ + umulh x4, x4, x13 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x13 __LF \ + adc x3, xzr, xzr __LF \ + subs x2, x2, x5 __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adcs x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adc x6, xzr, xzr __LF \ + mov x8, #-4294967295 __LF \ + adds x14, x2, x8 __LF \ + mov x8, #4294967295 __LF \ + adcs x15, x9, x8 __LF \ + mov x8, #1 __LF \ + adcs x16, x10, x8 __LF \ + adcs x17, x11, xzr __LF \ + adcs x19, x12, xzr __LF \ + adcs x20, x13, xzr __LF \ + adcs x6, x6, xzr __LF \ + csel x2, x2, x14, eq __LF \ + csel x9, x9, x15, eq __LF \ + csel x10, x10, x16, eq __LF \ + csel x11, x11, x17, eq __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + stp x2, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + stp x12, x13, [P0+32] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). In +// fact, with the Karatsuba-based Montgomery mul here, we don't even +// *need* the restriction that the other argument is reduced. + +#define amontsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x8, x2, x4 __LF \ + adds x10, x10, x8 __LF \ + mul x11, x2, x5 __LF \ + mul x8, x3, x4 __LF \ + adcs x11, x11, x8 __LF \ + umulh x12, x2, x5 __LF \ + mul x8, x3, x5 __LF \ + adcs x12, x12, x8 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x13, x2, x7 __LF \ + mul x8, x3, x6 __LF \ + adcs x13, x13, x8 __LF \ + umulh x14, x2, x7 __LF \ + mul x8, x3, x7 __LF \ + adcs x14, x14, x8 __LF \ + mul x15, x5, x6 __LF \ + adcs x15, x15, xzr __LF \ + umulh x16, x5, x6 __LF \ + adc x16, x16, xzr __LF \ + umulh x8, x2, x4 __LF \ + adds x11, x11, x8 __LF \ + umulh x8, x3, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x3, x5 __LF \ + adcs x13, x13, x8 __LF \ + umulh x8, x3, x6 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x3, x7 __LF \ + adcs x15, x15, x8 __LF \ + adc x16, x16, xzr __LF \ + mul x8, x2, x6 __LF \ + adds x12, x12, x8 __LF \ + mul x8, x4, x5 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x4, x6 __LF \ + adcs x14, x14, x8 __LF \ + mul x8, x4, x7 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x5, x7 __LF \ + adcs x16, x16, x8 __LF \ + mul x17, x6, x7 __LF \ + adcs x17, x17, xzr __LF \ + umulh x19, x6, x7 __LF \ + adc x19, x19, xzr __LF \ + umulh x8, x2, x6 __LF \ + adds x13, x13, x8 __LF \ + umulh x8, x4, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x4, x6 __LF \ + adcs x15, x15, x8 __LF \ + umulh x8, x4, x7 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x5, x7 __LF \ + adcs x17, x17, x8 __LF \ + adc x19, x19, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + cset x20, hs __LF \ + umulh x8, x2, x2 __LF \ + mul x2, x2, x2 __LF \ + adds x9, x9, x8 __LF \ + mul x8, x3, x3 __LF \ + adcs x10, x10, x8 __LF \ + umulh x8, x3, x3 __LF \ + adcs x11, x11, x8 __LF \ + mul x8, x4, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x4, x4 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x5, x5 
__LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x5, x5 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x6, x6 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x6, x6 __LF \ + adcs x17, x17, x8 __LF \ + mul x8, x7, x7 __LF \ + adcs x19, x19, x8 __LF \ + umulh x8, x7, x7 __LF \ + adc x20, x20, x8 __LF \ + lsl x5, x2, #32 __LF \ + add x2, x5, x2 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x2 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x2 __LF \ + umulh x4, x4, x2 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x2 __LF \ + adc x3, xzr, xzr __LF \ + subs x9, x9, x5 __LF \ + sbcs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x2, x2, xzr __LF \ + lsl x5, x9, #32 __LF \ + add x9, x5, x9 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x9 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x9 __LF \ + umulh x4, x4, x9 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x9 __LF \ + adc x3, xzr, xzr __LF \ + subs x10, x10, x5 __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x2, x2, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x5, x10, #32 __LF \ + add x10, x5, x10 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x10 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x10 __LF \ + umulh x4, x4, x10 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x10 __LF \ + adc x3, xzr, xzr __LF \ + subs x11, x11, x5 __LF \ + sbcs x12, x12, x4 __LF \ + sbcs x13, x13, x3 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + lsl x5, x11, #32 __LF \ + add x11, x5, x11 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x11 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x11 __LF \ + umulh x4, x4, x11 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x11 __LF \ + adc x3, xzr, xzr __LF \ + subs x12, x12, x5 __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x2, x2, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + lsl x5, x12, #32 __LF \ + add x12, x5, x12 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x12 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x12 __LF \ + umulh x4, x4, x12 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x12 __LF \ + adc x3, xzr, xzr __LF \ + subs x13, x13, x5 __LF \ + sbcs x2, x2, x4 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x5, x13, #32 __LF \ + add x13, x5, x13 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x13 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x13 __LF \ + umulh x4, x4, x13 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x13 __LF \ + adc x3, xzr, xzr __LF \ + subs x2, x2, x5 __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adcs x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + mov x14, #-4294967295 __LF \ + mov x15, #4294967295 __LF \ + csel x14, x14, xzr, cs __LF \ + csel x15, x15, xzr, cs __LF \ + cset x16, cs __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, xzr __LF \ + adcs x12, x12, xzr __LF \ + adc x13, x13, xzr __LF \ + stp x2, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + 
sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + csetm x3, lo __LF \ + mov x4, #4294967295 __LF \ + and x4, x4, x3 __LF \ + adds x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + mov x4, #-2 __LF \ + and x4, x4, x3 __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + adcs x9, x9, x3 __LF \ + adc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p384(z1sq,z_1) + amontsqr_p384(z2sq,z_2) + + montmul_p384(y1a,z_2,y_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,z1sq,x_2) + montmul_p384(x1a,z2sq,x_1) + montmul_p384(y2a,z1sq,y2a) + montmul_p384(y1a,z2sq,y1a) + + sub_p384(xd,x2a,x1a) + sub_p384(yd,y2a,y1a) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x1a) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(xd,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y1a) + montmul_p384(resz,xd,z_2) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x20, x20, x21 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x6, x7, [z_2] + ldp x8, x9, [z_2+16] + ldp x10, x11, [z_2+32] + + orr x21, x6, x7 + orr x22, x8, x9 + orr x23, x10, x11 + orr x21, x21, x22 + orr x21, x21, x23 + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x12, x13, [resz] + csel x12, x0, x12, lo + csel x13, x1, x13, lo + csel x12, x6, x12, hi + csel x13, x7, x13, hi + ldp x14, x15, [resz+16] + csel x14, x2, x14, lo + csel x15, x3, x15, lo + csel x14, x8, x14, hi + csel x15, x9, x15, hi + ldp x16, x17, [resz+32] + csel x16, x4, x16, lo + csel x17, x5, x17, lo + csel x16, x10, x16, hi + csel x17, x11, x17, hi + + ldp x20, x21, [x_1] + ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] + ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [y_1] + ldp x6, x7, [resy] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [y_2] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldp x20, x21, [y_1+16] + ldp x8, x9, [resy+16] + csel x8, x20, x8, lo + csel x9, x21, x9, lo + ldp x20, x21, [y_2+16] + csel 
x8, x20, x8, hi + csel x9, x21, x9, hi + + ldp x20, x21, [y_1+32] + ldp x10, x11, [resy+32] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2+32] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjdouble.S similarity index 100% rename from third_party/s2n-bignum/arm/p384/p384_montjdouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjdouble.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjdouble_alt.S new file mode 100644 index 00000000000..c8a96fbba44 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjdouble_alt.S @@ -0,0 +1,951 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjdouble_alt +// (uint64_t p3[static 18],uint64_t p1[static 18]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x23 +#define input_x x24 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y2 sp, #(NUMSIZE*1) +#define x2p sp, #(NUMSIZE*2) +#define xy2 sp, #(NUMSIZE*3) + +#define y4 sp, #(NUMSIZE*4) +#define t2 sp, #(NUMSIZE*4) + +#define dx2 sp, #(NUMSIZE*5) +#define t1 sp, #(NUMSIZE*5) + +#define d sp, #(NUMSIZE*6) +#define x4p sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x12, x3, x5 __LF \ + umulh x13, x3, x5 __LF \ + mul x11, x3, x6 __LF \ + umulh x14, x3, x6 __LF \ + adds x13, x13, x11 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + umulh x15, x3, x7 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x8 __LF \ + umulh x16, x3, x8 __LF \ + adcs x15, x15, x11 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x11, x3, x9 __LF \ + umulh x17, x3, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x19, x3, x10 __LF \ + adcs x17, x17, x11 __LF \ + adc x19, x19, xzr __LF \ + mul x11, x4, x5 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x19, x19, x11 __LF \ + cset x20, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x20, x20, x11 __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x11, x3, x5 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x3, x7 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x20, x20, x11 __LF \ + cset x21, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x21, x21, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x21, x21, x11 __LF \ + cset x22, cs __LF \ + umulh 
x11, x4, x5 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x22, x22, x11 __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x11, x3, x5 __LF \ + adds x16, x16, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x7 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x22, x22, x11 __LF \ + cset x2, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x17, x17, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x2, x2, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x17, x17, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x22, x22, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x2, x2, x11 __LF \ + cset x1, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x19, x19, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x2, x2, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x1, x1, x11 __LF \ + lsl x7, x12, #32 __LF \ + add x12, x7, x12 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x12 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x12 __LF \ + umulh x6, x6, x12 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x5, xzr, xzr __LF \ + subs x13, x13, x7 __LF \ + sbcs x14, x14, x6 __LF \ + sbcs x15, x15, x5 __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x7, x13, #32 __LF \ + add x13, x7, x13 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x13 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x13 __LF \ + umulh x6, x6, x13 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x13 __LF \ + adc x5, xzr, xzr __LF \ + subs x14, x14, x7 __LF \ + sbcs x15, x15, x6 __LF \ + sbcs x16, x16, x5 __LF \ + sbcs x17, x17, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + lsl x7, x14, #32 __LF \ + add x14, x7, x14 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x14 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x14 __LF \ + umulh x6, x6, x14 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x14 __LF \ + adc x5, xzr, xzr __LF \ + subs x15, x15, x7 __LF \ + sbcs x16, x16, x6 __LF \ + sbcs x17, x17, x5 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x14, x14, xzr __LF \ + lsl x7, x15, #32 __LF \ + add x15, x7, x15 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x15 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x15 __LF \ + umulh x6, x6, x15 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x15 __LF \ + adc x5, xzr, xzr __LF \ + subs x16, x16, x7 __LF \ + sbcs x17, x17, x6 __LF \ + sbcs x12, x12, x5 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + lsl x7, x16, #32 __LF \ + add x16, x7, x16 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x16 __LF \ + 
mov x6, #0xffffffff __LF \ + mul x5, x6, x16 __LF \ + umulh x6, x6, x16 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x16 __LF \ + adc x5, xzr, xzr __LF \ + subs x17, x17, x7 __LF \ + sbcs x12, x12, x6 __LF \ + sbcs x13, x13, x5 __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbc x16, x16, xzr __LF \ + lsl x7, x17, #32 __LF \ + add x17, x7, x17 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x17 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x17 __LF \ + umulh x6, x6, x17 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x17 __LF \ + adc x5, xzr, xzr __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, x6 __LF \ + sbcs x14, x14, x5 __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbc x17, x17, xzr __LF \ + adds x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adcs x14, x14, x21 __LF \ + adcs x15, x15, x22 __LF \ + adcs x16, x16, x2 __LF \ + adcs x17, x17, x1 __LF \ + adc x10, xzr, xzr __LF \ + mov x11, #0xffffffff00000001 __LF \ + adds x19, x12, x11 __LF \ + mov x11, #0xffffffff __LF \ + adcs x20, x13, x11 __LF \ + mov x11, #0x1 __LF \ + adcs x21, x14, x11 __LF \ + adcs x22, x15, xzr __LF \ + adcs x2, x16, xzr __LF \ + adcs x1, x17, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + csel x14, x14, x21, eq __LF \ + csel x15, x15, x22, eq __LF \ + csel x16, x16, x2, eq __LF \ + csel x17, x17, x1, eq __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] __LF \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x8, x2, x4 __LF \ + adds x10, x10, x8 __LF \ + mul x11, x2, x5 __LF \ + mul x8, x3, x4 __LF \ + adcs x11, x11, x8 __LF \ + umulh x12, x2, x5 __LF \ + mul x8, x3, x5 __LF \ + adcs x12, x12, x8 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x13, x2, x7 __LF \ + mul x8, x3, x6 __LF \ + adcs x13, x13, x8 __LF \ + umulh x14, x2, x7 __LF \ + mul x8, x3, x7 __LF \ + adcs x14, x14, x8 __LF \ + mul x15, x5, x6 __LF \ + adcs x15, x15, xzr __LF \ + umulh x16, x5, x6 __LF \ + adc x16, x16, xzr __LF \ + umulh x8, x2, x4 __LF \ + adds x11, x11, x8 __LF \ + umulh x8, x3, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x3, x5 __LF \ + adcs x13, x13, x8 __LF \ + umulh x8, x3, x6 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x3, x7 __LF \ + adcs x15, x15, x8 __LF \ + adc x16, x16, xzr __LF \ + mul x8, x2, x6 __LF \ + adds x12, x12, x8 __LF \ + mul x8, x4, x5 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x4, x6 __LF \ + adcs x14, x14, x8 __LF \ + mul x8, x4, x7 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x5, x7 __LF \ + adcs x16, x16, x8 __LF \ + mul x17, x6, x7 __LF \ + adcs x17, x17, xzr __LF \ + umulh x19, x6, x7 __LF \ + adc x19, x19, xzr __LF \ + umulh x8, x2, x6 __LF \ + adds x13, x13, x8 __LF \ + umulh x8, x4, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x4, x6 __LF \ + adcs x15, x15, x8 __LF \ + umulh x8, x4, x7 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x5, x7 __LF \ + adcs x17, x17, x8 __LF \ + adc x19, x19, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + cset x20, hs __LF \ + umulh x8, x2, x2 __LF \ + mul x2, x2, x2 __LF \ + adds x9, x9, x8 __LF \ + mul x8, x3, x3 __LF \ + adcs x10, 
x10, x8 __LF \ + umulh x8, x3, x3 __LF \ + adcs x11, x11, x8 __LF \ + mul x8, x4, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x4, x4 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x5, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x5, x5 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x6, x6 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x6, x6 __LF \ + adcs x17, x17, x8 __LF \ + mul x8, x7, x7 __LF \ + adcs x19, x19, x8 __LF \ + umulh x8, x7, x7 __LF \ + adc x20, x20, x8 __LF \ + lsl x5, x2, #32 __LF \ + add x2, x5, x2 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x2 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x2 __LF \ + umulh x4, x4, x2 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x2 __LF \ + adc x3, xzr, xzr __LF \ + subs x9, x9, x5 __LF \ + sbcs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x2, x2, xzr __LF \ + lsl x5, x9, #32 __LF \ + add x9, x5, x9 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x9 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x9 __LF \ + umulh x4, x4, x9 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x9 __LF \ + adc x3, xzr, xzr __LF \ + subs x10, x10, x5 __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x2, x2, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x5, x10, #32 __LF \ + add x10, x5, x10 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x10 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x10 __LF \ + umulh x4, x4, x10 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x10 __LF \ + adc x3, xzr, xzr __LF \ + subs x11, x11, x5 __LF \ + sbcs x12, x12, x4 __LF \ + sbcs x13, x13, x3 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + lsl x5, x11, #32 __LF \ + add x11, x5, x11 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x11 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x11 __LF \ + umulh x4, x4, x11 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x11 __LF \ + adc x3, xzr, xzr __LF \ + subs x12, x12, x5 __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x2, x2, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + lsl x5, x12, #32 __LF \ + add x12, x5, x12 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x12 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x12 __LF \ + umulh x4, x4, x12 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x12 __LF \ + adc x3, xzr, xzr __LF \ + subs x13, x13, x5 __LF \ + sbcs x2, x2, x4 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x5, x13, #32 __LF \ + add x13, x5, x13 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x13 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x13 __LF \ + umulh x4, x4, x13 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x13 __LF \ + adc x3, xzr, xzr __LF \ + subs x2, x2, x5 __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adcs x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adc x6, xzr, xzr __LF \ + mov x8, #-4294967295 __LF \ + adds x14, x2, x8 __LF \ + mov x8, #4294967295 __LF \ + adcs x15, x9, x8 __LF \ + mov x8, #1 __LF \ + adcs x16, x10, x8 __LF \ + adcs x17, x11, xzr __LF \ + adcs x19, x12, xzr __LF \ + adcs x20, x13, xzr __LF \ + adcs x6, x6, xzr __LF \ + csel x2, x2, x14, eq __LF \ + csel x9, x9, x15, eq __LF \ + 
csel x10, x10, x16, eq __LF \ + csel x11, x11, x17, eq __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + stp x2, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + csetm x3, lo __LF \ + mov x4, #4294967295 __LF \ + and x4, x4, x3 __LF \ + adds x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + mov x4, #-2 __LF \ + and x4, x4, x3 __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + adcs x9, x9, x3 __LF \ + adc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +// Corresponds exactly to bignum_add_p384 + +#define add_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x3 __LF \ + adc x3, xzr, xzr __LF \ + mov x4, #0xffffffff __LF \ + cmp x5, x4 __LF \ + mov x4, #0xffffffff00000000 __LF \ + sbcs xzr, x6, x4 __LF \ + mov x4, #0xfffffffffffffffe __LF \ + sbcs xzr, x7, x4 __LF \ + adcs xzr, x8, xzr __LF \ + adcs xzr, x9, xzr __LF \ + adcs xzr, x10, xzr __LF \ + adcs x3, x3, xzr __LF \ + csetm x3, ne __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + subs x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + sbcs x6, x6, x4 __LF \ + mov x4, #0xfffffffffffffffe __LF \ + and x4, x4, x3 __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x3 __LF \ + sbc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +// P0 = 4 * P1 - P2 + +#define cmsub41_p384(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P1+32] __LF \ + lsl x0, x1, #2 __LF \ + ldp x7, x8, [P2] __LF \ + subs x0, x0, x7 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x8 __LF \ + ldp x7, x8, [P2+16] __LF \ + extr x2, x3, x2, #62 __LF \ + sbcs x2, x2, x7 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x8 __LF \ + extr x4, x5, x4, #62 __LF \ + ldp x7, x8, [P2+32] __LF \ + sbcs x4, x4, x7 __LF \ + extr x5, x6, x5, #62 __LF \ + sbcs x5, x5, x8 __LF \ + lsr x6, x6, #62 __LF \ + adc x6, x6, xzr __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x8, cc __LF \ + mov x9, #0xffffffff __LF \ + and x9, x9, x8 __LF \ + adds x0, x0, x9 __LF \ + eor x9, x9, x8 __LF \ + adcs x1, x1, x9 __LF \ + mov x9, #0xfffffffffffffffe __LF \ + and x9, x9, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x8 __LF \ + adcs x4, x4, x8 __LF \ + adc x5, x5, x8 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ + stp x4, x5, [P0+32] + +// P0 = C * P1 - D * P2 + +#define cmsub_p384(P0,C,P1,D,P2) \ + ldp x0, x1, [P2] __LF \ + mov x6, #0x00000000ffffffff __LF \ + subs x6, x6, x0 __LF \ + mov x7, #0xffffffff00000000 __LF \ + sbcs x7, x7, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + mov x8, 
#0xfffffffffffffffe __LF \ + sbcs x8, x8, x0 __LF \ + mov x13, #0xffffffffffffffff __LF \ + sbcs x9, x13, x1 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x10, x13, x0 __LF \ + sbc x11, x13, x1 __LF \ + mov x12, D __LF \ + mul x0, x12, x6 __LF \ + mul x1, x12, x7 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x4, x12, x10 __LF \ + mul x5, x12, x11 __LF \ + umulh x6, x12, x6 __LF \ + umulh x7, x12, x7 __LF \ + umulh x8, x12, x8 __LF \ + umulh x9, x12, x9 __LF \ + umulh x10, x12, x10 __LF \ + umulh x12, x12, x11 __LF \ + adds x1, x1, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x8 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + mov x6, #1 __LF \ + adc x6, x12, x6 __LF \ + ldp x8, x9, [P1] __LF \ + ldp x10, x11, [P1+16] __LF \ + ldp x12, x13, [P1+32] __LF \ + mov x14, C __LF \ + mul x15, x14, x8 __LF \ + umulh x8, x14, x8 __LF \ + adds x0, x0, x15 __LF \ + mul x15, x14, x9 __LF \ + umulh x9, x14, x9 __LF \ + adcs x1, x1, x15 __LF \ + mul x15, x14, x10 __LF \ + umulh x10, x14, x10 __LF \ + adcs x2, x2, x15 __LF \ + mul x15, x14, x11 __LF \ + umulh x11, x14, x11 __LF \ + adcs x3, x3, x15 __LF \ + mul x15, x14, x12 __LF \ + umulh x12, x14, x12 __LF \ + adcs x4, x4, x15 __LF \ + mul x15, x14, x13 __LF \ + umulh x13, x14, x13 __LF \ + adcs x5, x5, x15 __LF \ + adc x6, x6, xzr __LF \ + adds x1, x1, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x10 __LF \ + adcs x4, x4, x11 __LF \ + adcs x5, x5, x12 __LF \ + adcs x6, x6, x13 __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x6, cc __LF \ + mov x7, #0xffffffff __LF \ + and x7, x7, x6 __LF \ + adds x0, x0, x7 __LF \ + eor x7, x7, x6 __LF \ + adcs x1, x1, x7 __LF \ + mov x7, #0xfffffffffffffffe __LF \ + and x7, x7, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x6 __LF \ + adcs x4, x4, x6 __LF \ + adc x5, x5, x6 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ + stp x4, x5, [P0+32] + +// A weak version of add that only guarantees sum in 6 digits + +#define weakadd_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x3 __LF \ + csetm x3, cs __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + subs x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + sbcs x6, x6, x4 __LF \ + mov x4, #0xfffffffffffffffe __LF \ + and x4, x4, x3 __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x3 __LF \ + sbc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +// P0 = 3 * P1 - 8 * P2 + +#define cmsub38_p384(P0,P1,P2) \ + ldp x0, x1, [P2] __LF \ + mov x6, #0x00000000ffffffff __LF \ + subs x6, x6, x0 __LF \ + mov x7, #0xffffffff00000000 __LF \ + sbcs x7, x7, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + mov x8, #0xfffffffffffffffe __LF \ + sbcs x8, x8, x0 __LF \ + mov x13, #0xffffffffffffffff __LF \ + sbcs x9, x13, x1 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x10, x13, x0 __LF \ + sbc x11, x13, x1 __LF \ + lsl x0, x6, #3 __LF \ + extr x1, x7, x6, #61 __LF \ + extr x2, x8, x7, #61 __LF \ + extr x3, x9, x8, #61 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + lsr x6, x11, #61 
__LF \ + add x6, x6, #1 __LF \ + ldp x8, x9, [P1] __LF \ + ldp x10, x11, [P1+16] __LF \ + ldp x12, x13, [P1+32] __LF \ + mov x14, 3 __LF \ + mul x15, x14, x8 __LF \ + umulh x8, x14, x8 __LF \ + adds x0, x0, x15 __LF \ + mul x15, x14, x9 __LF \ + umulh x9, x14, x9 __LF \ + adcs x1, x1, x15 __LF \ + mul x15, x14, x10 __LF \ + umulh x10, x14, x10 __LF \ + adcs x2, x2, x15 __LF \ + mul x15, x14, x11 __LF \ + umulh x11, x14, x11 __LF \ + adcs x3, x3, x15 __LF \ + mul x15, x14, x12 __LF \ + umulh x12, x14, x12 __LF \ + adcs x4, x4, x15 __LF \ + mul x15, x14, x13 __LF \ + umulh x13, x14, x13 __LF \ + adcs x5, x5, x15 __LF \ + adc x6, x6, xzr __LF \ + adds x1, x1, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x10 __LF \ + adcs x4, x4, x11 __LF \ + adcs x5, x5, x12 __LF \ + adcs x6, x6, x13 __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x6, cc __LF \ + mov x7, #0xffffffff __LF \ + and x7, x7, x6 __LF \ + adds x0, x0, x7 __LF \ + eor x7, x7, x6 __LF \ + adcs x1, x1, x7 __LF \ + mov x7, #0xfffffffffffffffe __LF \ + and x7, x7, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x6 __LF \ + adcs x4, x4, x6 __LF \ + adc x5, x5, x6 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ + stp x4, x5, [P0+32] + +S2N_BN_SYMBOL(p384_montjdouble_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p384(z2,z_1) + montsqr_p384(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + weakadd_p384(t1,x_1,z2) + sub_p384(t2,x_1,z2) + montmul_p384(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p384(t1,y_1,z_1) + montsqr_p384(x4p,x2p) + montmul_p384(xy2,x_1,y2) + +// t2 = (y + z)^2 + + montsqr_p384(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p384(d,12,xy2,9,x4p) + sub_p384(t1,t2,z2) + +// y4 = y^4 + + montsqr_p384(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p384(z_3,t1,y2) + montmul_p384(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p384(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p384(y_3,dx2,y4) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd.S new file mode 100644 index 00000000000..6c7b121fd84 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd.S @@ -0,0 +1,876 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjmixadd +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_p384 except x24 -> x0 + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P1+32] __LF \ + ldp x9, x10, [P2] __LF \ + ldp x11, x12, [P2+16] __LF \ + ldp x13, x14, [P2+32] __LF \ + mul x15, x3, x9 __LF \ + mul x21, x4, x10 __LF \ + mul x22, x5, x11 __LF \ + umulh x23, x3, x9 __LF \ + umulh x0, x4, x10 __LF \ + umulh x1, x5, x11 __LF \ + adds x23, x23, x21 __LF \ + adcs x0, x0, x22 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x23, x15 __LF \ + adcs x17, x0, x23 __LF \ + adcs x19, x1, x0 __LF \ + adc x20, x1, xzr __LF \ + adds x17, x17, x15 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x0, x3, x4 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x10, x9 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x16, x16, x21 __LF \ + adcs x17, x17, x22 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x23 __LF \ + adc x1, x1, x23 __LF \ + subs x0, x3, x5 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x11, x9 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x17, x17, x21 __LF \ + adcs x19, x19, x22 __LF \ + adcs x20, x20, x23 
__LF \ + adc x1, x1, x23 __LF \ + subs x0, x4, x5 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x11, x10 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x19, x19, x21 __LF \ + adcs x20, x20, x22 __LF \ + adc x1, x1, x23 __LF \ + lsl x23, x15, #32 __LF \ + add x15, x23, x15 __LF \ + lsr x23, x15, #32 __LF \ + subs x23, x23, x15 __LF \ + sbc x22, x15, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x15 __LF \ + adc x21, xzr, xzr __LF \ + subs x16, x16, x23 __LF \ + sbcs x17, x17, x22 __LF \ + sbcs x19, x19, x21 __LF \ + sbcs x20, x20, xzr __LF \ + sbcs x1, x1, xzr __LF \ + sbc x15, x15, xzr __LF \ + lsl x23, x16, #32 __LF \ + add x16, x23, x16 __LF \ + lsr x23, x16, #32 __LF \ + subs x23, x23, x16 __LF \ + sbc x22, x16, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x16 __LF \ + adc x21, xzr, xzr __LF \ + subs x17, x17, x23 __LF \ + sbcs x19, x19, x22 __LF \ + sbcs x20, x20, x21 __LF \ + sbcs x1, x1, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbc x16, x16, xzr __LF \ + lsl x23, x17, #32 __LF \ + add x17, x23, x17 __LF \ + lsr x23, x17, #32 __LF \ + subs x23, x23, x17 __LF \ + sbc x22, x17, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x17 __LF \ + adc x21, xzr, xzr __LF \ + subs x19, x19, x23 __LF \ + sbcs x20, x20, x22 __LF \ + sbcs x1, x1, x21 __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbc x17, x17, xzr __LF \ + stp x19, x20, [P0] __LF \ + stp x1, x15, [P0+16] __LF \ + stp x16, x17, [P0+32] __LF \ + mul x15, x6, x12 __LF \ + mul x21, x7, x13 __LF \ + mul x22, x8, x14 __LF \ + umulh x23, x6, x12 __LF \ + umulh x0, x7, x13 __LF \ + umulh x1, x8, x14 __LF \ + adds x23, x23, x21 __LF \ + adcs x0, x0, x22 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x23, x15 __LF \ + adcs x17, x0, x23 __LF \ + adcs x19, x1, x0 __LF \ + adc x20, x1, xzr __LF \ + adds x17, x17, x15 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x0, x6, x7 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x13, x12 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x16, x16, x21 __LF \ + adcs x17, x17, x22 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x23 __LF \ + adc x1, x1, x23 __LF \ + subs x0, x6, x8 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x14, x12 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x17, x17, x21 __LF \ + adcs x19, x19, x22 __LF \ + adcs x20, x20, x23 __LF \ + adc x1, x1, x23 __LF \ + subs x0, x7, x8 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x14, x13 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x19, x19, x21 __LF \ + adcs x20, x20, x22 __LF \ + adc x1, x1, x23 __LF \ + subs x6, x6, x3 __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x5 __LF \ + ngc x3, xzr __LF \ + cmn x3, #1 __LF \ + eor x6, x6, x3 __LF \ + adcs x6, x6, xzr __LF \ + eor x7, x7, x3 __LF 
\ + adcs x7, x7, xzr __LF \ + eor x8, x8, x3 __LF \ + adc x8, x8, xzr __LF \ + subs x9, x9, x12 __LF \ + sbcs x10, x10, x13 __LF \ + sbcs x11, x11, x14 __LF \ + ngc x14, xzr __LF \ + cmn x14, #1 __LF \ + eor x9, x9, x14 __LF \ + adcs x9, x9, xzr __LF \ + eor x10, x10, x14 __LF \ + adcs x10, x10, xzr __LF \ + eor x11, x11, x14 __LF \ + adc x11, x11, xzr __LF \ + eor x14, x3, x14 __LF \ + ldp x21, x22, [P0] __LF \ + adds x15, x15, x21 __LF \ + adcs x16, x16, x22 __LF \ + ldp x21, x22, [P0+16] __LF \ + adcs x17, x17, x21 __LF \ + adcs x19, x19, x22 __LF \ + ldp x21, x22, [P0+32] __LF \ + adcs x20, x20, x21 __LF \ + adcs x1, x1, x22 __LF \ + adc x2, xzr, xzr __LF \ + stp x15, x16, [P0] __LF \ + stp x17, x19, [P0+16] __LF \ + stp x20, x1, [P0+32] __LF \ + mul x15, x6, x9 __LF \ + mul x21, x7, x10 __LF \ + mul x22, x8, x11 __LF \ + umulh x23, x6, x9 __LF \ + umulh x0, x7, x10 __LF \ + umulh x1, x8, x11 __LF \ + adds x23, x23, x21 __LF \ + adcs x0, x0, x22 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x23, x15 __LF \ + adcs x17, x0, x23 __LF \ + adcs x19, x1, x0 __LF \ + adc x20, x1, xzr __LF \ + adds x17, x17, x15 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x0, x6, x7 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x10, x9 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x16, x16, x21 __LF \ + adcs x17, x17, x22 __LF \ + adcs x19, x19, x23 __LF \ + adcs x20, x20, x23 __LF \ + adc x1, x1, x23 __LF \ + subs x0, x6, x8 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x11, x9 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x17, x17, x21 __LF \ + adcs x19, x19, x22 __LF \ + adcs x20, x20, x23 __LF \ + adc x1, x1, x23 __LF \ + subs x0, x7, x8 __LF \ + cneg x0, x0, lo __LF \ + csetm x23, lo __LF \ + subs x22, x11, x10 __LF \ + cneg x22, x22, lo __LF \ + mul x21, x0, x22 __LF \ + umulh x22, x0, x22 __LF \ + cinv x23, x23, lo __LF \ + eor x21, x21, x23 __LF \ + eor x22, x22, x23 __LF \ + cmn x23, #1 __LF \ + adcs x19, x19, x21 __LF \ + adcs x20, x20, x22 __LF \ + adc x1, x1, x23 __LF \ + ldp x3, x4, [P0] __LF \ + ldp x5, x6, [P0+16] __LF \ + ldp x7, x8, [P0+32] __LF \ + cmn x14, #1 __LF \ + eor x15, x15, x14 __LF \ + adcs x15, x15, x3 __LF \ + eor x16, x16, x14 __LF \ + adcs x16, x16, x4 __LF \ + eor x17, x17, x14 __LF \ + adcs x17, x17, x5 __LF \ + eor x19, x19, x14 __LF \ + adcs x19, x19, x6 __LF \ + eor x20, x20, x14 __LF \ + adcs x20, x20, x7 __LF \ + eor x1, x1, x14 __LF \ + adcs x1, x1, x8 __LF \ + adcs x9, x14, x2 __LF \ + adcs x10, x14, xzr __LF \ + adcs x11, x14, xzr __LF \ + adc x12, x14, xzr __LF \ + adds x19, x19, x3 __LF \ + adcs x20, x20, x4 __LF \ + adcs x1, x1, x5 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, x7 __LF \ + adcs x11, x11, x8 __LF \ + adc x12, x12, x2 __LF \ + lsl x23, x15, #32 __LF \ + add x15, x23, x15 __LF \ + lsr x23, x15, #32 __LF \ + subs x23, x23, x15 __LF \ + sbc x22, x15, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x15 __LF \ + adc x21, xzr, xzr __LF \ + subs x16, x16, x23 __LF \ + sbcs x17, x17, x22 __LF \ + sbcs x19, x19, x21 __LF \ + sbcs x20, x20, xzr __LF \ + sbcs x1, x1, xzr __LF \ + sbc x15, x15, xzr __LF \ + lsl x23, x16, #32 __LF \ + add 
x16, x23, x16 __LF \ + lsr x23, x16, #32 __LF \ + subs x23, x23, x16 __LF \ + sbc x22, x16, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x16 __LF \ + adc x21, xzr, xzr __LF \ + subs x17, x17, x23 __LF \ + sbcs x19, x19, x22 __LF \ + sbcs x20, x20, x21 __LF \ + sbcs x1, x1, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbc x16, x16, xzr __LF \ + lsl x23, x17, #32 __LF \ + add x17, x23, x17 __LF \ + lsr x23, x17, #32 __LF \ + subs x23, x23, x17 __LF \ + sbc x22, x17, xzr __LF \ + extr x23, x22, x23, #32 __LF \ + lsr x22, x22, #32 __LF \ + adds x22, x22, x17 __LF \ + adc x21, xzr, xzr __LF \ + subs x19, x19, x23 __LF \ + sbcs x20, x20, x22 __LF \ + sbcs x1, x1, x21 __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbc x17, x17, xzr __LF \ + adds x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adc x12, x12, xzr __LF \ + add x22, x12, #1 __LF \ + lsl x21, x22, #32 __LF \ + subs x0, x22, x21 __LF \ + sbc x21, x21, xzr __LF \ + adds x19, x19, x0 __LF \ + adcs x20, x20, x21 __LF \ + adcs x1, x1, x22 __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adcs x11, x11, xzr __LF \ + csetm x22, lo __LF \ + mov x23, #4294967295 __LF \ + and x23, x23, x22 __LF \ + adds x19, x19, x23 __LF \ + eor x23, x23, x22 __LF \ + adcs x20, x20, x23 __LF \ + mov x23, #-2 __LF \ + and x23, x23, x22 __LF \ + adcs x1, x1, x23 __LF \ + adcs x9, x9, x22 __LF \ + adcs x10, x10, x22 __LF \ + adc x11, x11, x22 __LF \ + stp x19, x20, [P0] __LF \ + stp x1, x9, [P0+16] __LF \ + stp x10, x11, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384 + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + ldp x4, x5, [P1+16] __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x14, x2, x3 __LF \ + mul x15, x2, x4 __LF \ + mul x16, x3, x4 __LF \ + mul x8, x2, x2 __LF \ + mul x10, x3, x3 __LF \ + mul x12, x4, x4 __LF \ + umulh x17, x2, x3 __LF \ + adds x15, x15, x17 __LF \ + umulh x17, x2, x4 __LF \ + adcs x16, x16, x17 __LF \ + umulh x17, x3, x4 __LF \ + adcs x17, x17, xzr __LF \ + umulh x9, x2, x2 __LF \ + umulh x11, x3, x3 __LF \ + umulh x13, x4, x4 __LF \ + adds x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adc x13, x13, xzr __LF \ + adds x9, x9, x14 __LF \ + adcs x10, x10, x15 __LF \ + adcs x11, x11, x16 __LF \ + adcs x12, x12, x17 __LF \ + adc x13, x13, xzr __LF \ + lsl x16, x8, #32 __LF \ + add x8, x16, x8 __LF \ + lsr x16, x8, #32 __LF \ + subs x16, x16, x8 __LF \ + sbc x15, x8, xzr __LF \ + extr x16, x15, x16, #32 __LF \ + lsr x15, x15, #32 __LF \ + adds x15, x15, x8 __LF \ + adc x14, xzr, xzr __LF \ + subs x9, x9, x16 __LF \ + sbcs x10, x10, x15 __LF \ + sbcs x11, x11, x14 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x8, x8, xzr __LF \ + lsl x16, x9, #32 __LF \ + add x9, x16, x9 __LF \ + lsr x16, x9, #32 __LF \ + subs x16, x16, x9 __LF \ + sbc x15, x9, xzr __LF \ + extr x16, x15, x16, #32 __LF \ + lsr x15, x15, #32 __LF \ + adds x15, x15, x9 __LF \ + adc x14, xzr, xzr __LF \ + subs x10, x10, x16 __LF \ + sbcs x11, x11, x15 __LF \ + sbcs x12, x12, x14 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x16, x10, #32 __LF \ + add x10, x16, x10 __LF \ + lsr x16, x10, #32 __LF \ + subs x16, x16, x10 __LF \ + sbc x15, x10, xzr __LF \ + extr x16, x15, x16, #32 __LF \ + lsr x15, x15, #32 __LF \ + adds x15, x15, x10 __LF \ + adc x14, xzr, xzr __LF \ + subs x11, x11, x16 __LF \ + sbcs x12, x12, x15 __LF \ + sbcs x13, x13, 
x14 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + stp x11, x12, [P0] __LF \ + stp x13, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + mul x8, x2, x5 __LF \ + mul x14, x3, x6 __LF \ + mul x15, x4, x7 __LF \ + umulh x16, x2, x5 __LF \ + umulh x17, x3, x6 __LF \ + umulh x1, x4, x7 __LF \ + adds x16, x16, x14 __LF \ + adcs x17, x17, x15 __LF \ + adc x1, x1, xzr __LF \ + adds x9, x16, x8 __LF \ + adcs x10, x17, x16 __LF \ + adcs x11, x1, x17 __LF \ + adc x12, x1, xzr __LF \ + adds x10, x10, x8 __LF \ + adcs x11, x11, x16 __LF \ + adcs x12, x12, x17 __LF \ + adc x13, x1, xzr __LF \ + subs x17, x2, x3 __LF \ + cneg x17, x17, lo __LF \ + csetm x14, lo __LF \ + subs x15, x6, x5 __LF \ + cneg x15, x15, lo __LF \ + mul x16, x17, x15 __LF \ + umulh x15, x17, x15 __LF \ + cinv x14, x14, lo __LF \ + eor x16, x16, x14 __LF \ + eor x15, x15, x14 __LF \ + cmn x14, #1 __LF \ + adcs x9, x9, x16 __LF \ + adcs x10, x10, x15 __LF \ + adcs x11, x11, x14 __LF \ + adcs x12, x12, x14 __LF \ + adc x13, x13, x14 __LF \ + subs x17, x2, x4 __LF \ + cneg x17, x17, lo __LF \ + csetm x14, lo __LF \ + subs x15, x7, x5 __LF \ + cneg x15, x15, lo __LF \ + mul x16, x17, x15 __LF \ + umulh x15, x17, x15 __LF \ + cinv x14, x14, lo __LF \ + eor x16, x16, x14 __LF \ + eor x15, x15, x14 __LF \ + cmn x14, #1 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x15 __LF \ + adcs x12, x12, x14 __LF \ + adc x13, x13, x14 __LF \ + subs x17, x3, x4 __LF \ + cneg x17, x17, lo __LF \ + csetm x14, lo __LF \ + subs x15, x7, x6 __LF \ + cneg x15, x15, lo __LF \ + mul x16, x17, x15 __LF \ + umulh x15, x17, x15 __LF \ + cinv x14, x14, lo __LF \ + eor x16, x16, x14 __LF \ + eor x15, x15, x14 __LF \ + cmn x14, #1 __LF \ + adcs x11, x11, x16 __LF \ + adcs x12, x12, x15 __LF \ + adc x13, x13, x14 __LF \ + adds x8, x8, x8 __LF \ + adcs x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adc x17, xzr, xzr __LF \ + ldp x2, x3, [P0] __LF \ + adds x8, x8, x2 __LF \ + adcs x9, x9, x3 __LF \ + ldp x2, x3, [P0+16] __LF \ + adcs x10, x10, x2 __LF \ + adcs x11, x11, x3 __LF \ + ldp x2, x3, [P0+32] __LF \ + adcs x12, x12, x2 __LF \ + adcs x13, x13, x3 __LF \ + adc x17, x17, xzr __LF \ + lsl x4, x8, #32 __LF \ + add x8, x4, x8 __LF \ + lsr x4, x8, #32 __LF \ + subs x4, x4, x8 __LF \ + sbc x3, x8, xzr __LF \ + extr x4, x3, x4, #32 __LF \ + lsr x3, x3, #32 __LF \ + adds x3, x3, x8 __LF \ + adc x2, xzr, xzr __LF \ + subs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x2 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x8, x8, xzr __LF \ + lsl x4, x9, #32 __LF \ + add x9, x4, x9 __LF \ + lsr x4, x9, #32 __LF \ + subs x4, x4, x9 __LF \ + sbc x3, x9, xzr __LF \ + extr x4, x3, x4, #32 __LF \ + lsr x3, x3, #32 __LF \ + adds x3, x3, x9 __LF \ + adc x2, xzr, xzr __LF \ + subs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, x2 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x4, x10, #32 __LF \ + add x10, x4, x10 __LF \ + lsr x4, x10, #32 __LF \ + subs x4, x4, x10 __LF \ + sbc x3, x10, xzr __LF \ + extr x4, x3, x4, #32 __LF \ + lsr x3, x3, #32 __LF \ + adds x3, x3, x10 __LF \ + adc x2, xzr, xzr __LF \ + subs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, x2 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + adds x17, x17, x8 __LF \ + adcs x8, x9, xzr __LF \ + adcs x9, x10, xzr __LF \ + adcs x10, xzr, 
xzr __LF \ + mul x1, x5, x5 __LF \ + adds x11, x11, x1 __LF \ + mul x14, x6, x6 __LF \ + mul x15, x7, x7 __LF \ + umulh x1, x5, x5 __LF \ + adcs x12, x12, x1 __LF \ + umulh x1, x6, x6 __LF \ + adcs x13, x13, x14 __LF \ + adcs x17, x17, x1 __LF \ + umulh x1, x7, x7 __LF \ + adcs x8, x8, x15 __LF \ + adcs x9, x9, x1 __LF \ + adc x10, x10, xzr __LF \ + mul x1, x5, x6 __LF \ + mul x14, x5, x7 __LF \ + mul x15, x6, x7 __LF \ + umulh x16, x5, x6 __LF \ + adds x14, x14, x16 __LF \ + umulh x16, x5, x7 __LF \ + adcs x15, x15, x16 __LF \ + umulh x16, x6, x7 __LF \ + adc x16, x16, xzr __LF \ + adds x1, x1, x1 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, xzr, xzr __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x14 __LF \ + adcs x17, x17, x15 __LF \ + adcs x8, x8, x16 __LF \ + adcs x9, x9, x5 __LF \ + adc x10, x10, xzr __LF \ + mov x1, #-4294967295 __LF \ + mov x14, #4294967295 __LF \ + mov x15, #1 __LF \ + cmn x11, x1 __LF \ + adcs xzr, x12, x14 __LF \ + adcs xzr, x13, x15 __LF \ + adcs xzr, x17, xzr __LF \ + adcs xzr, x8, xzr __LF \ + adcs xzr, x9, xzr __LF \ + adc x10, x10, xzr __LF \ + neg x10, x10 __LF \ + and x1, x1, x10 __LF \ + adds x11, x11, x1 __LF \ + and x14, x14, x10 __LF \ + adcs x12, x12, x14 __LF \ + and x15, x15, x10 __LF \ + adcs x13, x13, x15 __LF \ + adcs x17, x17, xzr __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + stp x11, x12, [P0] __LF \ + stp x13, x17, [P0+16] __LF \ + stp x8, x9, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + csetm x3, lo __LF \ + mov x4, #4294967295 __LF \ + and x4, x4, x3 __LF \ + adds x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + mov x4, #-2 __LF \ + and x4, x4, x3 __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + adcs x9, x9, x3 __LF \ + adc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjmixadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_p384(zp2,z_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,zp2,x_2) + montmul_p384(y2a,zp2,y2a) + + sub_p384(xd,x2a,x_1) + sub_p384(yd,y2a,y_1) + + montsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x_1) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(resz,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y_1) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + orr x6, x0, x1 + orr x7, x2, x3 + orr x8, x4, x5 + orr x6, x6, x7 + orr x6, x6, x8 + cmp x6, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. 
+// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x19, x20, [x_2] + csel x0, x0, x19, ne + csel x1, x1, x20, ne + ldp x2, x3, [resx+16] + ldp x19, x20, [x_2+16] + csel x2, x2, x19, ne + csel x3, x3, x20, ne + ldp x4, x5, [resx+32] + ldp x19, x20, [x_2+32] + csel x4, x4, x19, ne + csel x5, x5, x20, ne + + ldp x6, x7, [resy] + ldp x19, x20, [y_2] + csel x6, x6, x19, ne + csel x7, x7, x20, ne + ldp x8, x9, [resy+16] + ldp x19, x20, [y_2+16] + csel x8, x8, x19, ne + csel x9, x9, x20, ne + ldp x10, x11, [resy+32] + ldp x19, x20, [y_2+32] + csel x10, x10, x19, ne + csel x11, x11, x20, ne + + ldp x12, x13, [resz] + mov x19, #0xffffffff00000001 + mov x20, #0x00000000ffffffff + csel x12, x12, x19, ne + csel x13, x13, x20, ne + ldp x14, x15, [resz+16] + mov x19, #1 + csel x14, x14, x19, ne + csel x15, x15, xzr, ne + ldp x16, x17, [resz+32] + csel x16, x16, xzr, ne + csel x17, x17, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd_alt.S new file mode 100644 index 00000000000..44756c0bd6f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjmixadd_alt.S @@ -0,0 +1,941 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// +// extern void p384_montjmixadd_alt +// (uint64_t p3[static 18],uint64_t p1[static 18],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^384 * x) mod p_384. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. 
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjmixadd_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 + +// Stable homes for input arguments during main code sequence + +#define input_z x24 +#define input_x x25 +#define input_y x26 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p384_alt + +#define montmul_p384(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x12, x3, x5 __LF \ + umulh x13, x3, x5 __LF \ + mul x11, x3, x6 __LF \ + umulh x14, x3, x6 __LF \ + adds x13, x13, x11 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + umulh x15, x3, x7 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x8 __LF \ + umulh x16, x3, x8 __LF \ + adcs x15, x15, x11 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x11, x3, x9 __LF \ + umulh x17, x3, x9 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x19, x3, x10 __LF \ + adcs x17, x17, x11 __LF \ + adc x19, x19, xzr __LF \ + mul x11, x4, x5 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x19, x19, x11 __LF \ + cset x20, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x15, x15, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x20, x20, x11 __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x11, x3, x5 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x15, x15, x11 __LF \ + mul x11, x3, x7 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x20, x20, x11 __LF \ + cset x21, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x15, x15, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x16, x16, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x21, x21, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x15, x15, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x16, x16, x11 __LF \ + mul x11, x4, x7 __LF \ + 
adcs x17, x17, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x21, x21, x11 __LF \ + cset x22, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x16, x16, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x17, x17, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x22, x22, x11 __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x11, x3, x5 __LF \ + adds x16, x16, x11 __LF \ + mul x11, x3, x6 __LF \ + adcs x17, x17, x11 __LF \ + mul x11, x3, x7 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x3, x8 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x3, x9 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x3, x10 __LF \ + adcs x22, x22, x11 __LF \ + cset x2, cs __LF \ + umulh x11, x3, x5 __LF \ + adds x17, x17, x11 __LF \ + umulh x11, x3, x6 __LF \ + adcs x19, x19, x11 __LF \ + umulh x11, x3, x7 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x3, x8 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x3, x9 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x3, x10 __LF \ + adc x2, x2, x11 __LF \ + mul x11, x4, x5 __LF \ + adds x17, x17, x11 __LF \ + mul x11, x4, x6 __LF \ + adcs x19, x19, x11 __LF \ + mul x11, x4, x7 __LF \ + adcs x20, x20, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x21, x21, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x22, x22, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x2, x2, x11 __LF \ + cset x1, cs __LF \ + umulh x11, x4, x5 __LF \ + adds x19, x19, x11 __LF \ + umulh x11, x4, x6 __LF \ + adcs x20, x20, x11 __LF \ + umulh x11, x4, x7 __LF \ + adcs x21, x21, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x22, x22, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x2, x2, x11 __LF \ + umulh x11, x4, x10 __LF \ + adc x1, x1, x11 __LF \ + lsl x7, x12, #32 __LF \ + add x12, x7, x12 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x12 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x12 __LF \ + umulh x6, x6, x12 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x5, xzr, xzr __LF \ + subs x13, x13, x7 __LF \ + sbcs x14, x14, x6 __LF \ + sbcs x15, x15, x5 __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x7, x13, #32 __LF \ + add x13, x7, x13 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x13 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x13 __LF \ + umulh x6, x6, x13 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x13 __LF \ + adc x5, xzr, xzr __LF \ + subs x14, x14, x7 __LF \ + sbcs x15, x15, x6 __LF \ + sbcs x16, x16, x5 __LF \ + sbcs x17, x17, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + lsl x7, x14, #32 __LF \ + add x14, x7, x14 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x14 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x14 __LF \ + umulh x6, x6, x14 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x14 __LF \ + adc x5, xzr, xzr __LF \ + subs x15, x15, x7 __LF \ + sbcs x16, x16, x6 __LF \ + sbcs x17, x17, x5 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x14, x14, xzr __LF \ + lsl x7, x15, #32 __LF \ + add x15, x7, x15 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x15 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x15 __LF \ + umulh x6, x6, x15 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x15 __LF \ + adc x5, xzr, xzr __LF \ + subs x16, x16, x7 __LF \ + sbcs x17, x17, x6 __LF \ + sbcs x12, 
x12, x5 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x15, x15, xzr __LF \ + lsl x7, x16, #32 __LF \ + add x16, x7, x16 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x16 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x16 __LF \ + umulh x6, x6, x16 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x16 __LF \ + adc x5, xzr, xzr __LF \ + subs x17, x17, x7 __LF \ + sbcs x12, x12, x6 __LF \ + sbcs x13, x13, x5 __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbc x16, x16, xzr __LF \ + lsl x7, x17, #32 __LF \ + add x17, x7, x17 __LF \ + mov x7, #0xffffffff00000001 __LF \ + umulh x7, x7, x17 __LF \ + mov x6, #0xffffffff __LF \ + mul x5, x6, x17 __LF \ + umulh x6, x6, x17 __LF \ + adds x7, x7, x5 __LF \ + adcs x6, x6, x17 __LF \ + adc x5, xzr, xzr __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, x6 __LF \ + sbcs x14, x14, x5 __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbc x17, x17, xzr __LF \ + adds x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adcs x14, x14, x21 __LF \ + adcs x15, x15, x22 __LF \ + adcs x16, x16, x2 __LF \ + adcs x17, x17, x1 __LF \ + adc x10, xzr, xzr __LF \ + mov x11, #0xffffffff00000001 __LF \ + adds x19, x12, x11 __LF \ + mov x11, #0xffffffff __LF \ + adcs x20, x13, x11 __LF \ + mov x11, #0x1 __LF \ + adcs x21, x14, x11 __LF \ + adcs x22, x15, xzr __LF \ + adcs x2, x16, xzr __LF \ + adcs x1, x17, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + csel x14, x14, x21, eq __LF \ + csel x15, x15, x22, eq __LF \ + csel x16, x16, x2, eq __LF \ + csel x17, x17, x1, eq __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x15, [P0+16] __LF \ + stp x16, x17, [P0+32] + +// Corresponds exactly to bignum_montsqr_p384_alt + +#define montsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x8, x2, x4 __LF \ + adds x10, x10, x8 __LF \ + mul x11, x2, x5 __LF \ + mul x8, x3, x4 __LF \ + adcs x11, x11, x8 __LF \ + umulh x12, x2, x5 __LF \ + mul x8, x3, x5 __LF \ + adcs x12, x12, x8 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x13, x2, x7 __LF \ + mul x8, x3, x6 __LF \ + adcs x13, x13, x8 __LF \ + umulh x14, x2, x7 __LF \ + mul x8, x3, x7 __LF \ + adcs x14, x14, x8 __LF \ + mul x15, x5, x6 __LF \ + adcs x15, x15, xzr __LF \ + umulh x16, x5, x6 __LF \ + adc x16, x16, xzr __LF \ + umulh x8, x2, x4 __LF \ + adds x11, x11, x8 __LF \ + umulh x8, x3, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x3, x5 __LF \ + adcs x13, x13, x8 __LF \ + umulh x8, x3, x6 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x3, x7 __LF \ + adcs x15, x15, x8 __LF \ + adc x16, x16, xzr __LF \ + mul x8, x2, x6 __LF \ + adds x12, x12, x8 __LF \ + mul x8, x4, x5 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x4, x6 __LF \ + adcs x14, x14, x8 __LF \ + mul x8, x4, x7 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x5, x7 __LF \ + adcs x16, x16, x8 __LF \ + mul x17, x6, x7 __LF \ + adcs x17, x17, xzr __LF \ + umulh x19, x6, x7 __LF \ + adc x19, x19, xzr __LF \ + umulh x8, x2, x6 __LF \ + adds x13, x13, x8 __LF \ + umulh x8, x4, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x4, x6 __LF \ + adcs x15, x15, x8 __LF \ + umulh x8, x4, x7 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x5, x7 __LF \ + adcs x17, x17, x8 __LF \ + adc x19, x19, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + 
adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + cset x20, hs __LF \ + umulh x8, x2, x2 __LF \ + mul x2, x2, x2 __LF \ + adds x9, x9, x8 __LF \ + mul x8, x3, x3 __LF \ + adcs x10, x10, x8 __LF \ + umulh x8, x3, x3 __LF \ + adcs x11, x11, x8 __LF \ + mul x8, x4, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x4, x4 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x5, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x5, x5 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x6, x6 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x6, x6 __LF \ + adcs x17, x17, x8 __LF \ + mul x8, x7, x7 __LF \ + adcs x19, x19, x8 __LF \ + umulh x8, x7, x7 __LF \ + adc x20, x20, x8 __LF \ + lsl x5, x2, #32 __LF \ + add x2, x5, x2 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x2 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x2 __LF \ + umulh x4, x4, x2 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x2 __LF \ + adc x3, xzr, xzr __LF \ + subs x9, x9, x5 __LF \ + sbcs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbc x2, x2, xzr __LF \ + lsl x5, x9, #32 __LF \ + add x9, x5, x9 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x9 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x9 __LF \ + umulh x4, x4, x9 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x9 __LF \ + adc x3, xzr, xzr __LF \ + subs x10, x10, x5 __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x2, x2, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x5, x10, #32 __LF \ + add x10, x5, x10 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x10 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x10 __LF \ + umulh x4, x4, x10 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x10 __LF \ + adc x3, xzr, xzr __LF \ + subs x11, x11, x5 __LF \ + sbcs x12, x12, x4 __LF \ + sbcs x13, x13, x3 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + lsl x5, x11, #32 __LF \ + add x11, x5, x11 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x11 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x11 __LF \ + umulh x4, x4, x11 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x11 __LF \ + adc x3, xzr, xzr __LF \ + subs x12, x12, x5 __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x2, x2, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + lsl x5, x12, #32 __LF \ + add x12, x5, x12 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x12 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x12 __LF \ + umulh x4, x4, x12 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x12 __LF \ + adc x3, xzr, xzr __LF \ + subs x13, x13, x5 __LF \ + sbcs x2, x2, x4 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x5, x13, #32 __LF \ + add x13, x5, x13 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x13 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x13 __LF \ + umulh x4, x4, x13 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x13 __LF \ + adc x3, xzr, xzr __LF \ + subs x2, x2, x5 __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adcs x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + adc x6, xzr, xzr __LF \ + mov x8, #-4294967295 __LF \ + adds x14, x2, x8 __LF \ + mov x8, #4294967295 __LF \ + adcs x15, x9, x8 __LF \ + mov 
x8, #1 __LF \ + adcs x16, x10, x8 __LF \ + adcs x17, x11, xzr __LF \ + adcs x19, x12, xzr __LF \ + adcs x20, x13, xzr __LF \ + adcs x6, x6, xzr __LF \ + csel x2, x2, x14, eq __LF \ + csel x9, x9, x15, eq __LF \ + csel x10, x10, x16, eq __LF \ + csel x11, x11, x17, eq __LF \ + csel x12, x12, x19, eq __LF \ + csel x13, x13, x20, eq __LF \ + stp x2, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + stp x12, x13, [P0+32] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). In +// fact, with the Karatsuba-based Montgomery mul here, we don't even +// *need* the restriction that the other argument is reduced. + +#define amontsqr_p384(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x8, x2, x4 __LF \ + adds x10, x10, x8 __LF \ + mul x11, x2, x5 __LF \ + mul x8, x3, x4 __LF \ + adcs x11, x11, x8 __LF \ + umulh x12, x2, x5 __LF \ + mul x8, x3, x5 __LF \ + adcs x12, x12, x8 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x13, x2, x7 __LF \ + mul x8, x3, x6 __LF \ + adcs x13, x13, x8 __LF \ + umulh x14, x2, x7 __LF \ + mul x8, x3, x7 __LF \ + adcs x14, x14, x8 __LF \ + mul x15, x5, x6 __LF \ + adcs x15, x15, xzr __LF \ + umulh x16, x5, x6 __LF \ + adc x16, x16, xzr __LF \ + umulh x8, x2, x4 __LF \ + adds x11, x11, x8 __LF \ + umulh x8, x3, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x3, x5 __LF \ + adcs x13, x13, x8 __LF \ + umulh x8, x3, x6 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x3, x7 __LF \ + adcs x15, x15, x8 __LF \ + adc x16, x16, xzr __LF \ + mul x8, x2, x6 __LF \ + adds x12, x12, x8 __LF \ + mul x8, x4, x5 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x4, x6 __LF \ + adcs x14, x14, x8 __LF \ + mul x8, x4, x7 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x5, x7 __LF \ + adcs x16, x16, x8 __LF \ + mul x17, x6, x7 __LF \ + adcs x17, x17, xzr __LF \ + umulh x19, x6, x7 __LF \ + adc x19, x19, xzr __LF \ + umulh x8, x2, x6 __LF \ + adds x13, x13, x8 __LF \ + umulh x8, x4, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x4, x6 __LF \ + adcs x15, x15, x8 __LF \ + umulh x8, x4, x7 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x5, x7 __LF \ + adcs x17, x17, x8 __LF \ + adc x19, x19, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + cset x20, hs __LF \ + umulh x8, x2, x2 __LF \ + mul x2, x2, x2 __LF \ + adds x9, x9, x8 __LF \ + mul x8, x3, x3 __LF \ + adcs x10, x10, x8 __LF \ + umulh x8, x3, x3 __LF \ + adcs x11, x11, x8 __LF \ + mul x8, x4, x4 __LF \ + adcs x12, x12, x8 __LF \ + umulh x8, x4, x4 __LF \ + adcs x13, x13, x8 __LF \ + mul x8, x5, x5 __LF \ + adcs x14, x14, x8 __LF \ + umulh x8, x5, x5 __LF \ + adcs x15, x15, x8 __LF \ + mul x8, x6, x6 __LF \ + adcs x16, x16, x8 __LF \ + umulh x8, x6, x6 __LF \ + adcs x17, x17, x8 __LF \ + mul x8, x7, x7 __LF \ + adcs x19, x19, x8 __LF \ + umulh x8, x7, x7 __LF \ + adc x20, x20, x8 __LF \ + lsl x5, x2, #32 __LF \ + add x2, x5, x2 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x2 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x2 __LF \ + umulh x4, x4, x2 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x2 __LF \ + adc x3, xzr, xzr __LF \ + subs x9, x9, x5 __LF \ + sbcs x10, x10, x4 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, 
xzr __LF \ + sbc x2, x2, xzr __LF \ + lsl x5, x9, #32 __LF \ + add x9, x5, x9 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x9 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x9 __LF \ + umulh x4, x4, x9 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x9 __LF \ + adc x3, xzr, xzr __LF \ + subs x10, x10, x5 __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x2, x2, xzr __LF \ + sbc x9, x9, xzr __LF \ + lsl x5, x10, #32 __LF \ + add x10, x5, x10 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x10 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x10 __LF \ + umulh x4, x4, x10 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x10 __LF \ + adc x3, xzr, xzr __LF \ + subs x11, x11, x5 __LF \ + sbcs x12, x12, x4 __LF \ + sbcs x13, x13, x3 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + lsl x5, x11, #32 __LF \ + add x11, x5, x11 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x11 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x11 __LF \ + umulh x4, x4, x11 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x11 __LF \ + adc x3, xzr, xzr __LF \ + subs x12, x12, x5 __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x2, x2, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + lsl x5, x12, #32 __LF \ + add x12, x5, x12 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x12 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x12 __LF \ + umulh x4, x4, x12 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x12 __LF \ + adc x3, xzr, xzr __LF \ + subs x13, x13, x5 __LF \ + sbcs x2, x2, x4 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbc x12, x12, xzr __LF \ + lsl x5, x13, #32 __LF \ + add x13, x5, x13 __LF \ + mov x5, #-4294967295 __LF \ + umulh x5, x5, x13 __LF \ + mov x4, #4294967295 __LF \ + mul x3, x4, x13 __LF \ + umulh x4, x4, x13 __LF \ + adds x5, x5, x3 __LF \ + adcs x4, x4, x13 __LF \ + adc x3, xzr, xzr __LF \ + subs x2, x2, x5 __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, x17 __LF \ + adcs x12, x12, x19 __LF \ + adcs x13, x13, x20 __LF \ + mov x14, #-4294967295 __LF \ + mov x15, #4294967295 __LF \ + csel x14, x14, xzr, cs __LF \ + csel x15, x15, xzr, cs __LF \ + cset x16, cs __LF \ + adds x2, x2, x14 __LF \ + adcs x9, x9, x15 __LF \ + adcs x10, x10, x16 __LF \ + adcs x11, x11, xzr __LF \ + adcs x12, x12, xzr __LF \ + adc x13, x13, xzr __LF \ + stp x2, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + stp x12, x13, [P0+32] + +// Corresponds exactly to bignum_sub_p384 + +#define sub_p384(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + csetm x3, lo __LF \ + mov x4, #4294967295 __LF \ + and x4, x4, x3 __LF \ + adds x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + adcs x6, x6, x4 __LF \ + mov x4, #-2 __LF \ + and x4, x4, x3 __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + adcs x9, x9, x3 __LF \ + adc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] + +S2N_BN_SYMBOL(p384_montjmixadd_alt): + +// Save regs and make 
room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p384(zp2,z_1) + montmul_p384(y2a,z_1,y_2) + + montmul_p384(x2a,zp2,x_2) + montmul_p384(y2a,zp2,y2a) + + sub_p384(xd,x2a,x_1) + sub_p384(yd,y2a,y_1) + + amontsqr_p384(zz,xd) + montsqr_p384(ww,yd) + + montmul_p384(zzx1,zz,x_1) + montmul_p384(zzx2,zz,x2a) + + sub_p384(resx,ww,zzx1) + sub_p384(t1,zzx2,zzx1) + + montmul_p384(resz,xd,z_1) + + sub_p384(resx,resx,zzx2) + + sub_p384(t2,zzx1,resx) + + montmul_p384(t1,t1,y_1) + montmul_p384(t2,yd,t2) + + sub_p384(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + orr x6, x0, x1 + orr x7, x2, x3 + orr x8, x4, x5 + orr x6, x6, x7 + orr x6, x6, x8 + cmp x6, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^384 - p_384), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x19, x20, [x_2] + csel x0, x0, x19, ne + csel x1, x1, x20, ne + ldp x2, x3, [resx+16] + ldp x19, x20, [x_2+16] + csel x2, x2, x19, ne + csel x3, x3, x20, ne + ldp x4, x5, [resx+32] + ldp x19, x20, [x_2+32] + csel x4, x4, x19, ne + csel x5, x5, x20, ne + + ldp x6, x7, [resy] + ldp x19, x20, [y_2] + csel x6, x6, x19, ne + csel x7, x7, x20, ne + ldp x8, x9, [resy+16] + ldp x19, x20, [y_2+16] + csel x8, x8, x19, ne + csel x9, x9, x20, ne + ldp x10, x11, [resy+32] + ldp x19, x20, [y_2+32] + csel x10, x10, x19, ne + csel x11, x11, x20, ne + + ldp x12, x13, [resz] + mov x19, #0xffffffff00000001 + mov x20, #0x00000000ffffffff + csel x12, x12, x19, ne + csel x13, x13, x20, ne + ldp x14, x15, [resz+16] + mov x19, #1 + csel x14, x14, x19, ne + csel x15, x15, xzr, ne + ldp x16, x17, [resz+32] + csel x16, x16, xzr, ne + csel x17, x17, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [y_3] + stp x8, x9, [y_3+16] + stp x10, x11, [y_3+32] + stp x12, x13, [z_3] + stp x14, x15, [z_3+16] + stp x16, x17, [z_3+32] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul.S new file mode 100644 index 00000000000..4e92ae69a74 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul.S @@ -0,0 +1,9988 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for P-384 +// Input scalar[6], point[18]; output res[18] +// +// extern void p384_montjscalarmul +// (uint64_t res[static 18], +// uint64_t scalar[static 6], +// uint64_t point[static 18]); +// +// This function is a variant of its affine point version p384_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^384 * x) mod p_384 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-384, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_384) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p384_montjscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p384_montjscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 48 +#define JACSIZE (3*NUMSIZE) + +// Safe copies of input res and additional values in variables. + +#define bf x22 +#define sgn x23 +#define j x24 +#define res x25 + +// Intermediate variables on the stack. +// The table is 16 entries, each of size JACSIZE = 3 * NUMSIZE + +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +#define NSPACE #(55*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmp bf, #(1*I) __LF \ + ldp x20, x21, [x19] __LF \ + csel x0, x20, x0, eq __LF \ + csel x1, x21, x1, eq __LF \ + ldp x20, x21, [x19, #16] __LF \ + csel x2, x20, x2, eq __LF \ + csel x3, x21, x3, eq __LF \ + ldp x20, x21, [x19, #32] __LF \ + csel x4, x20, x4, eq __LF \ + csel x5, x21, x5, eq __LF \ + ldp x20, x21, [x19, #48] __LF \ + csel x6, x20, x6, eq __LF \ + csel x7, x21, x7, eq __LF \ + ldp x20, x21, [x19, #64] __LF \ + csel x8, x20, x8, eq __LF \ + csel x9, x21, x9, eq __LF \ + ldp x20, x21, [x19, #80] __LF \ + csel x10, x20, x10, eq __LF \ + csel x11, x21, x11, eq __LF \ + ldp x20, x21, [x19, #96] __LF \ + csel x12, x20, x12, eq __LF \ + csel x13, x21, x13, eq __LF \ + ldp x20, x21, [x19, #112] __LF \ + csel x14, x20, x14, eq __LF \ + csel x15, x21, x15, eq __LF \ + ldp x20, x21, [x19, #128] __LF \ + csel x16, x20, x16, eq __LF \ + csel x17, x21, x17, eq __LF \ + add x19, x19, #JACSIZE + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(p384_montjscalarmul): + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" input argument; others get processed early. + + mov res, x0 + +// Reduce the input scalar mod n_384, i.e. 
conditionally subtract n_384. +// Store it to "scalarb". + + ldp x3, x4, [x1] + movbig(x15, #0xecec, #0x196a, #0xccc5, #0x2973) + ldp x5, x6, [x1, #16] + movbig(x16, #0x581a, #0x0db2, #0x48b0, #0xa77a) + ldp x7, x8, [x1, #32] + movbig(x17, #0xc763, #0x4d81, #0xf437, #0x2ddf) + + subs x9, x3, x15 + sbcs x10, x4, x16 + sbcs x11, x5, x17 + adcs x12, x6, xzr + adcs x13, x7, xzr + adcs x14, x8, xzr + + csel x3, x3, x9, cc + csel x4, x4, x10, cc + csel x5, x5, x11, cc + csel x6, x6, x12, cc + csel x7, x7, x13, cc + csel x8, x8, x14, cc + + stp x3, x4, [scalarb] + stp x5, x6, [scalarb+16] + stp x7, x8, [scalarb+32] + +// Set the tab[0] table entry to the input point = 1 * P + + ldp x10, x11, [x2] + stp x10, x11, [tab] + ldp x12, x13, [x2, #16] + stp x12, x13, [tab+16] + ldp x14, x15, [x2, #32] + stp x14, x15, [tab+32] + + ldp x10, x11, [x2, #48] + stp x10, x11, [tab+48] + ldp x12, x13, [x2, #64] + stp x12, x13, [tab+64] + ldp x14, x15, [x2, #80] + stp x14, x15, [tab+80] + + ldp x10, x11, [x2, #96] + stp x10, x11, [tab+96] + ldp x12, x13, [x2, #112] + stp x12, x13, [tab+112] + ldp x14, x15, [x2, #128] + stp x14, x15, [tab+128] + +// Compute and record tab[1] = 2 * p, ..., tab[15] = 16 * P + + add x0, tab+JACSIZE*1 + add x1, tab + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*2 + add x1, tab+JACSIZE*1 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*3 + add x1, tab+JACSIZE*1 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*4 + add x1, tab+JACSIZE*3 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*5 + add x1, tab+JACSIZE*2 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*6 + add x1, tab+JACSIZE*5 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*7 + add x1, tab+JACSIZE*3 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*8 + add x1, tab+JACSIZE*7 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*9 + add x1, tab+JACSIZE*4 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*10 + add x1, tab+JACSIZE*9 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*11 + add x1, tab+JACSIZE*5 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*12 + add x1, tab+JACSIZE*11 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*13 + add x1, tab+JACSIZE*6 + bl p384_montjscalarmul_p384_montjdouble + + add x0, tab+JACSIZE*14 + add x1, tab+JACSIZE*13 + add x2, tab + bl p384_montjscalarmul_p384_montjadd + + add x0, tab+JACSIZE*15 + add x1, tab+JACSIZE*7 + bl p384_montjscalarmul_p384_montjdouble + +// Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed +// digits. The digits of the constant, in lowest-to-highest order, are as +// follows; they are generated dynamically since none is a simple ARM load. +// +// 0x0842108421084210 +// 0x1084210842108421 +// 0x2108421084210842 +// 0x4210842108421084 +// 0x8421084210842108 +// 0x0842108421084210 + + ldp x0, x1, [scalarb] + ldp x2, x3, [scalarb+16] + ldp x4, x5, [scalarb+32] + movbig(x8, #0x1084, #0x2108, #0x4210, #0x8421) + adds x0, x0, x8, lsr #1 + adcs x1, x1, x8 + lsl x8, x8, #1 + adcs x2, x2, x8 + lsl x8, x8, #1 + adcs x3, x3, x8 + lsl x8, x8, #1 + adcs x4, x4, x8 + lsr x8, x8, #4 + adcs x5, x5, x8 + cset x6, cs + +// Record the top bitfield then shift the whole scalar left 4 bits +// to align the top of the next bitfield with the MSB (bits 379..383). 
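+// For example, with this recoding each 5-bit window value w in 0..31
+// stands for the signed digit w - 16 in -16..15 (so w = 3 encodes -13);
+// the main loop below recovers the sign and magnitude of each such digit
+// with "subs bf, bf, #16" / "cneg" before the constant-time table lookup.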
+ + extr bf, x6, x5, #60 + extr x5, x5, x4, #60 + extr x4, x4, x3, #60 + extr x3, x3, x2, #60 + extr x2, x2, x1, #60 + extr x1, x1, x0, #60 + lsl x0, x0, #4 + stp x0, x1, [scalarb] + stp x2, x3, [scalarb+16] + stp x4, x5, [scalarb+32] + +// Initialize the accumulator to the corresponding entry using constant-time +// lookup in the table. This top digit, uniquely, is not recoded so there is +// no sign adjustment to make. + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + mov x16, xzr + mov x17, xzr + + add x19, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + stp x12, x13, [acc+96] + stp x14, x15, [acc+112] + stp x16, x17, [acc+128] + + mov j, #380 + +// Main loop over size-5 bitfields: double 5 times then add signed digit +// At each stage we shift the scalar left by 5 bits so we can simply pick +// the top 5 bits as the bitfield, saving some fiddle over indexing. + +p384_montjscalarmul_mainloop: + sub j, j, #5 + + add x0, acc + add x1, acc + bl p384_montjscalarmul_p384_montjdouble + + add x0, acc + add x1, acc + bl p384_montjscalarmul_p384_montjdouble + + add x0, acc + add x1, acc + bl p384_montjscalarmul_p384_montjdouble + + add x0, acc + add x1, acc + bl p384_montjscalarmul_p384_montjdouble + + add x0, acc + add x1, acc + bl p384_montjscalarmul_p384_montjdouble + +// Choose the bitfield and adjust it to sign and magnitude + + ldp x0, x1, [scalarb] + ldp x2, x3, [scalarb+16] + ldp x4, x5, [scalarb+32] + lsr bf, x5, #59 + extr x5, x5, x4, #59 + extr x4, x4, x3, #59 + extr x3, x3, x2, #59 + extr x2, x2, x1, #59 + extr x1, x1, x0, #59 + lsl x0, x0, #5 + stp x0, x1, [scalarb] + stp x2, x3, [scalarb+16] + stp x4, x5, [scalarb+32] + + subs bf, bf, #16 + cset sgn, lo // sgn = sign of digit (1 = negative) + cneg bf, bf, lo // bf = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + mov x12, xzr + mov x13, xzr + mov x14, xzr + mov x15, xzr + mov x16, xzr + mov x17, xzr + + add x19, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + +// Store it to "tabent" with the y coordinate optionally negated. +// This is done carefully to give coordinates < p_384 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). 
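+// Concretely, the code below forms p_384 - y limb by limb and then uses
+// cmp/ccmp so that this negation is kept only when the digit is negative
+// (sgn set) and y is nonzero (tested via an OR of all its limbs); when
+// y = 0, p_384 - 0 = p_384 would not be a reduced coordinate, so the
+// original zero limbs are stored instead.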
+
+        stp x0, x1, [tabent]
+        stp x2, x3, [tabent+16]
+        stp x4, x5, [tabent+32]
+
+        stp x12, x13, [tabent+96]
+        stp x14, x15, [tabent+112]
+        stp x16, x17, [tabent+128]
+
+        mov x0, #0x00000000ffffffff
+        subs x0, x0, x6
+        orr x12, x6, x7
+        mov x1, #0xffffffff00000000
+        sbcs x1, x1, x7
+        orr x13, x8, x9
+        mov x2, #0xfffffffffffffffe
+        sbcs x2, x2, x8
+        orr x14, x10, x11
+        mov x5, #0xffffffffffffffff
+        sbcs x3, x5, x9
+        orr x12, x12, x13
+        sbcs x4, x5, x10
+        orr x12, x12, x14
+        sbcs x5, x5, x11
+
+        cmp sgn, xzr
+        ccmp x12, xzr, #4, ne
+
+        csel x6, x0, x6, ne
+        csel x7, x1, x7, ne
+        csel x8, x2, x8, ne
+        csel x9, x3, x9, ne
+        csel x10, x4, x10, ne
+        csel x11, x5, x11, ne
+
+        stp x6, x7, [tabent+48]
+        stp x8, x9, [tabent+64]
+        stp x10, x11, [tabent+80]
+
+// Add to the accumulator
+
+        add x0, acc
+        add x1, acc
+        add x2, tabent
+        bl p384_montjscalarmul_p384_montjadd
+
+        cbnz j, p384_montjscalarmul_mainloop
+
+// That's the end of the main loop, and we just need to copy the
+// result in "acc" to the output.
+
+        ldp x0, x1, [acc]
+        stp x0, x1, [res]
+        ldp x0, x1, [acc+16]
+        stp x0, x1, [res, #16]
+        ldp x0, x1, [acc+32]
+        stp x0, x1, [res, #32]
+        ldp x0, x1, [acc+48]
+        stp x0, x1, [res, #48]
+        ldp x0, x1, [acc+64]
+        stp x0, x1, [res, #64]
+        ldp x0, x1, [acc+80]
+        stp x0, x1, [res, #80]
+        ldp x0, x1, [acc+96]
+        stp x0, x1, [res, #96]
+        ldp x0, x1, [acc+112]
+        stp x0, x1, [res, #112]
+        ldp x0, x1, [acc+128]
+        stp x0, x1, [res, #128]
+
+// Restore stack and registers and return
+
+        add sp, sp, NSPACE
+        ldp x25, x30, [sp], 16
+        ldp x23, x24, [sp], 16
+        ldp x21, x22, [sp], 16
+        ldp x19, x20, [sp], 16
+        ret
+
+// Local copies of subroutines, complete clones at the moment
+
+p384_montjscalarmul_p384_montjadd:
+        stp x19, x20, [sp, #-16]!
+        stp x21, x22, [sp, #-16]!
+        stp x23, x24, [sp, #-16]!
+        stp x25, x26, [sp, #-16]!
+        stp x27, xzr, [sp, #-16]!
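The block above builds p_384 - y limb by limb and keeps it only when the digit was negative and y is nonzero (the orr chain into x12 feeding the ccmp). A small C model of that conditional negation, under the assumption of six little-endian 64-bit limbs; ct_cond_negate_y is an illustrative name, not something the patch defines:

/* Toy C model of conditional point negation mod p_384. Not part of the patch. */
#include <stdint.h>
#include <stdio.h>

static const uint64_t P384[6] = {
    0x00000000ffffffffULL, 0xffffffff00000000ULL, 0xfffffffffffffffeULL,
    0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL};

/* y := (negate && y != 0) ? p_384 - y : y, branch-free; negate is 0 or 1. */
static void ct_cond_negate_y(uint64_t y[6], uint64_t negate) {
  uint64_t neg[6], nonzero = 0, borrow = 0;
  for (int i = 0; i < 6; i++) {
    nonzero |= y[i];
    uint64_t d = P384[i] - y[i];              /* limb-wise p_384 - y ...     */
    uint64_t b = (uint64_t)(P384[i] < y[i]);  /* ... with borrow chaining    */
    neg[i] = d - borrow;
    borrow = b | (uint64_t)(d < borrow);
  }
  uint64_t nz = (nonzero | (0 - nonzero)) >> 63;  /* 1 iff y != 0            */
  uint64_t mask = 0 - (negate & nz);              /* all-ones iff negating   */
  for (int i = 0; i < 6; i++) {
    y[i] = (y[i] & ~mask) | (neg[i] & mask);      /* select y or p_384 - y   */
  }
}

int main(void) {
  uint64_t y[6] = {5, 0, 0, 0, 0, 0};
  ct_cond_negate_y(y, 1);                  /* y becomes p_384 - 5 */
  printf("low limb after negation = %#llx\n", (unsigned long long)y[0]);
  return 0;
}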
+ sub sp, sp, #0x180 + mov x24, x0 + mov x25, x1 + mov x26, x2 + mov x0, sp + ldr q1, [x25, #96] + ldp x9, x2, [x25, #96] + ldr q0, [x25, #96] + ldp x4, x6, [x25, #112] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [x25, #128] + xtn v30.2s, v0.2d + ldr q1, [x25, #128] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [x25, #128] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [x0] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [x0, #16] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [x0, #32] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc 
= lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [x0, #16] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [x0] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [x0, #32] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [x0] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [x0, #16] + adc x17, x14, xzr + stp x2, x17, [x0, #32] + ldr q1, [x26, #96] + ldp x9, x2, [x26, #96] + ldr q0, [x26, #96] + ldp x4, x6, [x26, #112] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [x26, #128] + xtn v30.2s, v0.2d + ldr q1, [x26, #128] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [x26, #128] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl 
v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #240] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [sp, #256] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #272] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [sp, #256] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor 
x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #240] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [sp, #272] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [sp, #240] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #256] + adc x17, x14, xzr + stp x2, x17, [sp, #272] + stp x23, x24, [sp, #0x150] + ldr q3, [x26, #96] + ldr q25, [x25, #48] + ldp x13, x23, [x25, #48] + ldp x3, x21, [x26, #96] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x25, #80] + ldp x8, x24, [x26, #112] + subs x6, x3, x21 + ldr q0, [x26, #128] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x25, #64] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, 
x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x25, #80] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x26, #128] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #288] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #304] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #320] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, 
x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #288] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #304] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #320] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #288] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #304] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #320] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #288] + ldp x21, x12, [sp, #304] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #320] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr 
x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #288] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #304] + adc x12, x15, x23 + stp x21, x12, [sp, #320] + ldr q3, [x25, #96] + ldr q25, [x26, #48] + ldp x13, x23, [x26, #48] + ldp x3, x21, [x25, #96] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x26, #80] + ldp x8, x24, [x25, #112] + subs x6, x3, x21 + ldr q0, [x25, #128] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x26, #64] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x26, #80] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x25, #128] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + 
sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #48] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #64] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #80] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #48] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #64] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #80] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #48] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #64] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #80] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last 
+ adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #48] + ldp x21, x12, [sp, #64] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #80] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #48] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #64] + adc x12, x15, x23 + stp x21, x12, [sp, #80] + mov x1, sp + ldr q3, [x1] + ldr q25, [x26] + ldp x13, x23, [x26] + ldp x3, x21, [x1] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x26, #32] + ldp x8, x24, [x1, #16] + subs x6, x3, x21 + ldr q0, [x1, #32] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x26, #16] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, 
v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x26, #32] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x1, #32] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #96] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #112] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #128] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc 
= lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #96] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #112] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #128] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #96] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #112] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #128] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #96] + ldp x21, x12, [sp, #112] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #128] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, 
x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #96] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #112] + adc x12, x15, x23 + stp x21, x12, [sp, #128] + ldr q3, [sp, #240] + ldr q25, [x25] + ldp x13, x23, [x25] + ldp x3, x21, [sp, #240] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x25, #32] + ldp x8, x24, [sp, #256] + subs x6, x3, x21 + ldr q0, [sp, #272] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x25, #16] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x25, #32] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #272] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, 
x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #192] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #208] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #224] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #192] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #208] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #224] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #192] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #208] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #224] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv 
x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #192] + ldp x21, x12, [sp, #208] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #224] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #192] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #208] + adc x12, x15, x23 + stp x21, x12, [sp, #224] + mov x1, sp + ldr q3, [x1] + ldr q25, [sp, #48] + ldp x13, x23, [sp, #48] + ldp x3, x21, [x1] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #80] + ldp x8, x24, [x1, #16] + subs x6, x3, x21 + ldr q0, [x1, #32] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #64] + mul 
v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #80] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x1, #32] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #48] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #64] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #80] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc 
= lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #48] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #64] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #80] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #48] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #64] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #80] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #48] + ldp x21, x12, [sp, #64] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #80] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs 
x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #48] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #64] + adc x12, x15, x23 + stp x21, x12, [sp, #80] + ldr q3, [sp, #240] + ldr q25, [sp, #288] + ldp x13, x23, [sp, #288] + ldp x3, x21, [sp, #240] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #320] + ldp x8, x24, [sp, #256] + subs x6, x3, x21 + ldr q0, [sp, #272] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #304] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #320] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #272] + 
lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #288] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #304] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #320] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #288] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #304] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #320] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #288] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #304] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #320] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = 
lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #288] + ldp x21, x12, [sp, #304] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #320] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x2, x24, x11 + stp x22, x5, [sp, #288] + adcs x11, x13, x23 + adcs x12, x8, x23 + stp x2, x11, [sp, #304] + adc x13, x15, x23 + stp x12, x13, [sp, #320] + ldp x5, x6, [sp, #96] + ldp x4, x3, [sp, #192] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #112] + ldp x4, x3, [sp, #208] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [sp, #128] + ldp x4, x3, [sp, #224] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, 
#0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [sp, #240] + stp x7, x8, [sp, #256] + stp x9, x10, [sp, #272] + ldp x5, x6, [sp, #48] + ldp x4, x3, [sp, #288] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #64] + sbcs x7, x7, x2 + sbcs x8, x8, x11 + ldp x9, x10, [sp, #80] + sbcs x9, x9, x12 + sbcs x10, x10, x13 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [sp, #48] + stp x7, x8, [sp, #64] + stp x9, x10, [sp, #80] + ldr q1, [sp, #240] + ldp x9, x2, [sp, #240] + ldr q0, [sp, #240] + ldp x4, x6, [sp, #256] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [sp, #272] + xtn v30.2s, v0.2d + ldr q1, [sp, #272] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [sp, #272] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #144] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [sp, #160] + csetm 
x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #176] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [sp, #160] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #144] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [sp, #176] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + 
adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [sp, #144] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #160] + adc x17, x14, xzr + stp x2, x17, [sp, #176] + mov x0, sp + ldr q1, [sp, #48] + ldp x9, x2, [sp, #48] + ldr q0, [sp, #48] + ldp x4, x6, [sp, #64] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [sp, #80] + xtn v30.2s, v0.2d + ldr q1, [sp, #80] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [sp, #80] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [x0] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [x0, #16] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [x0, #32] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + 
cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [x0, #16] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [x0] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [x0, #32] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [x0] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [x0, #16] + adc x17, x14, xzr + stp x2, x17, [x0, #32] + ldr q3, [sp, #144] + ldr q25, [sp, #192] + ldp x13, x23, [sp, #192] + ldp x3, x21, [sp, #144] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #224] + ldp x8, x24, [sp, #160] + subs x6, x3, x21 + ldr q0, [sp, #176] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, 
x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #208] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #224] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #176] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #192] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #208] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #224] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + 
adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #192] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #208] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #224] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #192] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #208] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #224] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #192] + ldp x21, x12, [sp, #208] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #224] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, 
x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #192] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #208] + adc x12, x15, x23 + stp x21, x12, [sp, #224] + ldr q3, [sp, #144] + ldr q25, [sp, #96] + ldp x13, x23, [sp, #96] + ldp x3, x21, [sp, #144] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #128] + ldp x8, x24, [sp, #160] + subs x6, x3, x21 + ldr q0, [sp, #176] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #112] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra 
v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #128] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #176] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #96] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #112] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #128] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #96] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #112] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #128] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #96] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #112] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #128] + cneg x3, x21, 
cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #96] + ldp x21, x12, [sp, #112] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #128] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x2, x24, x11 + stp x22, x5, [sp, #96] + adcs x11, x13, x23 + adcs x12, x8, x23 + stp x2, x11, [sp, #112] + adc x13, x15, x23 + stp x12, x13, [sp, #128] + mov x0, sp + mov x1, sp + ldp x5, x6, [x1] + ldp x4, x3, [sp, #192] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #208] + sbcs x7, x7, x4 + 
sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [sp, #224] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] + ldp x5, x6, [sp, #96] + ldp x4, x3, [sp, #192] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x4, x3, [sp, #208] + sbcs x7, x2, x4 + sbcs x8, x11, x3 + ldp x4, x3, [sp, #224] + sbcs x9, x12, x4 + sbcs x10, x13, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [sp, #144] + stp x7, x8, [sp, #160] + stp x9, x10, [sp, #176] + ldr q3, [sp, #240] + ldr q25, [x25, #96] + ldp x13, x23, [x25, #96] + ldp x3, x21, [sp, #240] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x25, #128] + ldp x8, x24, [sp, #256] + subs x6, x3, x21 + ldr q0, [sp, #272] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x25, #112] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x25, #128] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #272] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, 
xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #240] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #256] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #272] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #240] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #256] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #272] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #240] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #256] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #272] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, 
x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #240] + ldp x21, x12, [sp, #256] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #272] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #240] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #256] + adc x12, x15, x23 + stp x21, x12, [sp, #272] + mov x0, sp + mov x1, sp + ldp x5, x6, [x1] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [sp, #128] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x2, x5, x4 + eor x4, x4, x3 + adcs x11, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x4, x7, x4 + adcs x12, x8, x3 + adcs x13, x9, x3 + adc x3, x10, x3 + stp x2, x11, [x0] + stp x4, x12, [x0, #16] + stp x13, x3, [x0, #32] + ldp x5, x6, [sp, #192] + subs x5, x5, x2 + 
sbcs x6, x6, x11 + ldp x7, x8, [sp, #208] + sbcs x7, x7, x4 + sbcs x8, x8, x12 + ldp x9, x10, [sp, #224] + sbcs x9, x9, x13 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [sp, #192] + stp x7, x8, [sp, #208] + stp x9, x10, [sp, #224] + ldr q3, [sp, #144] + ldr q25, [sp, #288] + ldp x13, x23, [sp, #288] + ldp x3, x21, [sp, #144] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #320] + ldp x8, x24, [sp, #160] + subs x6, x3, x21 + ldr q0, [sp, #176] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #304] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #320] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #176] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + 
sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #144] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #160] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #176] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #144] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #160] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #176] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #144] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #160] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #176] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, 
x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #144] + ldp x21, x12, [sp, #160] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #176] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #144] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #160] + adc x12, x15, x23 + stp x21, x12, [sp, #176] + ldr q3, [sp, #240] + ldr q25, [x26, #96] + ldp x13, x23, [x26, #96] + ldp x3, x21, [sp, #240] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [x26, #128] + ldp x8, x24, [sp, #256] + subs x6, x3, x21 + ldr q0, [sp, #272] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [x26, #112] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, 
x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [x26, #128] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #272] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #240] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #256] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #272] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, 
x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #240] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #256] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #272] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #240] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #256] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #272] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #240] + ldp x21, x12, [sp, #256] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #272] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, 
x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #240] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #256] + adc x12, x15, x23 + stp x21, x12, [sp, #272] + ldp x2, x27, [sp, #0x150] + ldr q3, [sp, #48] + ldr q25, [sp, #192] + ldp x13, x23, [sp, #192] + ldp x3, x21, [sp, #48] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #224] + ldp x8, x24, [sp, #64] + subs x6, x3, x21 + ldr q0, [sp, #80] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #208] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #224] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #80] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr 
x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #192] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #208] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #224] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #192] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #208] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #224] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #192] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #208] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #224] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, 
x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #192] + ldp x21, x12, [sp, #208] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #224] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x2, x6, x20 + eor x3, x20, x23 + adcs x6, x7, x3 + adcs x7, x24, x11 + adcs x9, x13, x23 + adcs x10, x8, x23 + adc x11, x15, x23 + ldp x4, x3, [sp, #144] + subs x5, x2, x4 + sbcs x6, x6, x3 + ldp x4, x3, [sp, #160] + sbcs x7, x7, x4 + sbcs x8, x9, x3 + ldp x4, x3, [sp, #176] + sbcs x9, x10, x4 + sbcs x10, x11, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x19, x5, x4 + eor x4, x4, x3 + adcs x24, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x7, x8, [sp, #208] + stp x9, x10, [sp, #224] + ldp x0, x1, [x25, #96] + ldp x2, x3, [x25, #112] + ldp x4, x5, [x25, #128] + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x20, x20, x21 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne // ne = any + ldp x6, x7, [x26, #96] + ldp x8, x9, [x26, #112] + ldp x10, x11, [x26, #128] + orr x21, x6, x7 + orr x22, x8, x9 + orr x23, x10, x11 + orr x21, x21, x22 + orr x21, x21, x23 + cmp x21, xzr + cset x21, ne // ne = any + cmp x21, x20 + ldp x12, x13, [sp, #240] + csel x12, x0, x12, cc // cc 
= lo, ul, last + csel x13, x1, x13, cc // cc = lo, ul, last + csel x12, x6, x12, hi // hi = pmore + csel x13, x7, x13, hi // hi = pmore + ldp x14, x15, [sp, #256] + csel x14, x2, x14, cc // cc = lo, ul, last + csel x15, x3, x15, cc // cc = lo, ul, last + csel x14, x8, x14, hi // hi = pmore + csel x15, x9, x15, hi // hi = pmore + ldp x16, x17, [sp, #272] + csel x16, x4, x16, cc // cc = lo, ul, last + csel x17, x5, x17, cc // cc = lo, ul, last + csel x16, x10, x16, hi // hi = pmore + csel x17, x11, x17, hi // hi = pmore + ldp x20, x21, [x25] + ldp x0, x1, [sp] + csel x0, x20, x0, cc // cc = lo, ul, last + csel x1, x21, x1, cc // cc = lo, ul, last + ldp x20, x21, [x26] + csel x0, x20, x0, hi // hi = pmore + csel x1, x21, x1, hi // hi = pmore + ldp x20, x21, [x25, #16] + ldp x2, x3, [sp, #16] + csel x2, x20, x2, cc // cc = lo, ul, last + csel x3, x21, x3, cc // cc = lo, ul, last + ldp x20, x21, [x26, #16] + csel x2, x20, x2, hi // hi = pmore + csel x3, x21, x3, hi // hi = pmore + ldp x20, x21, [x25, #32] + ldp x4, x5, [sp, #32] + csel x4, x20, x4, cc // cc = lo, ul, last + csel x5, x21, x5, cc // cc = lo, ul, last + ldp x20, x21, [x26, #32] + csel x4, x20, x4, hi // hi = pmore + csel x5, x21, x5, hi // hi = pmore + ldp x20, x21, [x25, #48] + csel x6, x20, x19, cc // cc = lo, ul, last + csel x7, x21, x24, cc // cc = lo, ul, last + ldp x20, x21, [x26, #48] + csel x6, x20, x6, hi // hi = pmore + csel x7, x21, x7, hi // hi = pmore + ldp x20, x21, [x25, #64] + ldp x8, x9, [sp, #208] + csel x8, x20, x8, cc // cc = lo, ul, last + csel x9, x21, x9, cc // cc = lo, ul, last + ldp x20, x21, [x26, #64] + csel x8, x20, x8, hi // hi = pmore + csel x9, x21, x9, hi // hi = pmore + ldp x20, x21, [x25, #80] + ldp x10, x11, [sp, #224] + csel x10, x20, x10, cc // cc = lo, ul, last + csel x11, x21, x11, cc // cc = lo, ul, last + ldp x20, x21, [x26, #80] + csel x10, x20, x10, hi // hi = pmore + csel x11, x21, x11, hi // hi = pmore + stp x0, x1, [x27] + stp x2, x3, [x27, #16] + stp x4, x5, [x27, #32] + stp x6, x7, [x27, #48] + stp x8, x9, [x27, #64] + stp x10, x11, [x27, #80] + stp x12, x13, [x27, #96] + stp x14, x15, [x27, #112] + stp x16, x17, [x27, #128] + add sp, sp, #0x180 + ldp x27, xzr, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p384_montjscalarmul_p384_montjdouble: + sub sp, sp, #0x1a0 + stp x19, x20, [sp, #336] + stp x21, x22, [sp, #352] + stp x23, x24, [sp, #368] + stp x25, x26, [sp, #384] + stp x27, xzr, [sp, #400] + mov x25, x0 + mov x26, x1 + mov x0, sp + ldr q1, [x26, #96] + ldp x9, x2, [x26, #96] + ldr q0, [x26, #96] + ldp x4, x6, [x26, #112] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [x26, #128] + xtn v30.2s, v0.2d + ldr q1, [x26, #128] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [x26, #128] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, 
v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [x0] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [x0, #16] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [x0, #32] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [x0, #16] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [x0] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [x0, #32] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 
+ lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [x0] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [x0, #16] + adc x17, x14, xzr + stp x2, x17, [x0, #32] + ldr q1, [x26, #48] + ldp x9, x2, [x26, #48] + ldr q0, [x26, #48] + ldp x4, x6, [x26, #64] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [x26, #80] + xtn v30.2s, v0.2d + ldr q1, [x26, #80] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [x26, #80] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 
+ mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #48] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [sp, #64] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #80] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [sp, #64] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #48] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [sp, #80] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, 
v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [sp, #48] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #64] + adc x17, x14, xzr + stp x2, x17, [sp, #80] + ldp x5, x6, [x26] + ldp x4, x3, [sp] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x26, #16] + ldp x4, x3, [sp, #16] + adcs x7, x7, x4 + adcs x8, x8, x3 + ldp x9, x10, [x26, #32] + ldp x4, x3, [sp, #32] + adcs x9, x9, x4 + adcs x10, x10, x3 + csetm x3, cs // cs = hs, nlast + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + subs x5, x5, x4 + eor x4, x4, x3 + sbcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + sbcs x7, x7, x4 + sbcs x8, x8, x3 + sbcs x9, x9, x3 + sbc x10, x10, x3 + stp x5, x6, [sp, #240] + stp x7, x8, [sp, #256] + stp x9, x10, [sp, #272] + mov x2, sp + ldp x5, x6, [x26] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x26, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x26, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x13, x5, x4 + eor x4, x4, x3 + adcs x23, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x13, x23, [sp, #192] + stp x7, x8, [sp, #208] + stp x9, x10, [sp, #224] + ldr q3, [sp, #240] + ldr q25, [sp, #192] + ldp x3, x21, [sp, #240] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #224] + ldp x8, x24, [sp, #256] + subs x6, x3, x21 + ldr q0, [sp, #272] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #208] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // 
cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #224] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #272] + lsr x1, x2, #32 + eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x16, x4, x16 + mov x4, v27.d[0] + sbcs x11, x20, x11 + sbcs x20, x9, x12 + stp x16, x11, [sp, #96] + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #112] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #128] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg 
x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + ldp x20, x9, [sp, #96] + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #112] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #128] + adds x20, x22, x20 + mul x10, x13, x14 + adcs x11, x11, x9 + eor x9, x8, x21 + adcs x21, x19, x17 + stp x20, x11, [sp, #96] + adcs x12, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + stp x21, x12, [sp, #112] + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #128] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #96] + ldp x21, x12, [sp, #112] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #128] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x21 + eor x1, x22, x9 + adcs x24, x23, x12 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x21 + adcs x15, x17, x12 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, 
x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #96] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #112] + adc x12, x15, x23 + stp x21, x12, [sp, #128] + ldp x5, x6, [x26, #48] + ldp x4, x3, [x26, #96] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x26, #64] + ldp x4, x3, [x26, #112] + adcs x7, x7, x4 + adcs x8, x8, x3 + ldp x9, x10, [x26, #80] + ldp x4, x3, [x26, #128] + adcs x9, x9, x4 + adcs x10, x10, x3 + adc x3, xzr, xzr + mov x4, #0xffffffff // #4294967295 + cmp x5, x4 + mov x4, #0xffffffff00000000 // #-4294967296 + sbcs xzr, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + sbcs xzr, x7, x4 + adcs xzr, x8, xzr + adcs xzr, x9, xzr + adcs xzr, x10, xzr + adcs x3, x3, xzr + csetm x3, ne // ne = any + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + subs x5, x5, x4 + eor x4, x4, x3 + sbcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + sbcs x7, x7, x4 + sbcs x8, x8, x3 + sbcs x9, x9, x3 + sbc x10, x10, x3 + stp x5, x6, [sp, #240] + stp x7, x8, [sp, #256] + stp x9, x10, [sp, #272] + ldr q1, [sp, #96] + ldp x9, x2, [sp, #96] + ldr q0, [sp, #96] + ldp x4, x6, [sp, #112] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [sp, #128] + xtn v30.2s, v0.2d + ldr q1, [sp, #128] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [sp, #128] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, 
x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x15, x3, x17 + sbcs x3, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #288] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + stp x15, x3, [sp, #304] + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #320] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + ldp x9, x17, [sp, #304] + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #288] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [sp, #320] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x9 + adcs x1, x1, x17 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + 
lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [sp, #288] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #304] + adc x17, x14, xzr + stp x2, x17, [sp, #320] + ldr q3, [x26] + ldr q25, [sp, #48] + ldp x13, x23, [sp, #48] + ldp x3, x21, [x26] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #80] + ldp x8, x24, [x26, #16] + subs x6, x3, x21 + ldr q0, [x26, #32] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #64] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #80] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [x26, #32] + lsr x1, x2, #32 + eor 
x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x26, x4, x16 + mov x4, v27.d[0] + sbcs x27, x20, x11 + sbcs x20, x9, x12 + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #160] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #176] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #160] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #176] + adds x20, x22, x26 + mul x10, x13, x14 + adcs x11, x11, x27 + eor x9, x8, x21 + adcs x26, x19, x17 + stp x20, x11, [sp, #144] + adcs x27, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #176] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + cneg 
x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #144] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #176] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x26 + eor x1, x22, x9 + adcs x24, x23, x27 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x26 + adcs x15, x17, x27 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #144] + adcs x5, x13, x23 + adcs x21, x8, x23 + stp x14, x5, [sp, #160] + adc x12, x15, x23 + stp x21, x12, [sp, #176] + ldr q1, [sp, #240] + ldp x9, x2, [sp, #240] + ldr q0, [sp, #240] + ldp x4, x6, [sp, #256] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [sp, #272] + xtn v30.2s, v0.2d + ldr q1, [sp, #272] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [sp, #272] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + 
mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x19, x3, x17 + sbcs x20, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #192] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #224] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #192] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp 
x15, x12, [sp, #224] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x19 + adcs x1, x1, x20 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x19, x13, x1 + and x13, x4, x9 + adcs x20, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #208] + adc x17, x14, xzr + stp x2, x17, [sp, #224] + ldp x0, x1, [sp, #288] + mov x6, #0xffffffff // #4294967295 + subs x6, x6, x0 + mov x7, #0xffffffff00000000 // #-4294967296 + sbcs x7, x7, x1 + ldp x0, x1, [sp, #304] + mov x8, #0xfffffffffffffffe // #-2 + sbcs x8, x8, x0 + mov x13, #0xffffffffffffffff // #-1 + sbcs x9, x13, x1 + ldp x0, x1, [sp, #320] + sbcs x10, x13, x0 + sbc x11, x13, x1 + mov x12, #0x9 // #9 + mul x0, x12, x6 + mul x1, x12, x7 + mul x2, x12, x8 + mul x3, x12, x9 + mul x4, x12, x10 + mul x5, x12, x11 + umulh x6, x12, x6 + umulh x7, x12, x7 + umulh x8, x12, x8 + umulh x9, x12, x9 + umulh x10, x12, x10 + umulh x12, x12, x11 + adds x1, x1, x6 + adcs x2, x2, x7 + adcs x3, x3, x8 + adcs x4, x4, x9 + adcs x5, x5, x10 + mov x6, #0x1 // #1 + adc x6, x12, x6 + ldp x8, x9, [sp, #144] + ldp x10, x11, [sp, #160] + ldp x12, x13, [sp, #176] + mov x14, #0xc // #12 + mul x15, x14, x8 + umulh x8, x14, x8 + adds x0, x0, x15 + mul x15, x14, x9 + umulh x9, x14, x9 + adcs x1, x1, x15 + mul x15, x14, x10 + umulh x10, x14, x10 + adcs x2, x2, x15 + mul x15, x14, x11 + umulh x11, x14, x11 + adcs x3, x3, x15 + mul x15, x14, x12 + umulh x12, x14, x12 + adcs x4, x4, x15 + mul x15, x14, x13 + umulh x13, x14, x13 + adcs 
x5, x5, x15 + adc x6, x6, xzr + adds x1, x1, x8 + adcs x2, x2, x9 + adcs x3, x3, x10 + adcs x4, x4, x11 + adcs x5, x5, x12 + adcs x6, x6, x13 + lsl x7, x6, #32 + subs x8, x6, x7 + sbc x7, x7, xzr + adds x0, x0, x8 + adcs x1, x1, x7 + adcs x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + adcs x5, x5, xzr + csetm x6, cc // cc = lo, ul, last + mov x7, #0xffffffff // #4294967295 + and x7, x7, x6 + adds x0, x0, x7 + eor x7, x7, x6 + adcs x1, x1, x7 + mov x7, #0xfffffffffffffffe // #-2 + and x7, x7, x6 + adcs x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x6 + adc x5, x5, x6 + stp x0, x1, [sp, #288] + stp x2, x3, [sp, #304] + stp x4, x5, [sp, #320] + mov x2, sp + ldp x4, x3, [x2] + subs x5, x19, x4 + sbcs x6, x20, x3 + ldp x7, x8, [sp, #208] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [sp, #224] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [sp, #240] + stp x7, x8, [sp, #256] + stp x9, x10, [sp, #272] + ldr q1, [sp, #48] + ldp x9, x2, [sp, #48] + ldr q0, [sp, #48] + ldp x4, x6, [sp, #64] + rev64 v21.4s, v1.4s + uzp2 v28.4s, v1.4s, v1.4s + umulh x7, x9, x2 + xtn v17.2s, v1.2d + mul v27.4s, v21.4s, v0.4s + ldr q20, [sp, #80] + xtn v30.2s, v0.2d + ldr q1, [sp, #80] + uzp2 v31.4s, v0.4s, v0.4s + ldp x5, x10, [sp, #80] + umulh x8, x9, x4 + uaddlp v3.2d, v27.4s + umull v16.2d, v30.2s, v17.2s + mul x16, x9, x4 + umull v27.2d, v30.2s, v28.2s + shrn v0.2s, v20.2d, #32 + xtn v7.2s, v20.2d + shl v20.2d, v3.2d, #32 + umull v3.2d, v31.2s, v28.2s + mul x3, x2, x4 + umlal v20.2d, v30.2s, v17.2s + umull v22.2d, v7.2s, v0.2s + usra v27.2d, v16.2d, #32 + umulh x11, x2, x4 + movi v21.2d, #0xffffffff + uzp2 v28.4s, v1.4s, v1.4s + adds x15, x16, x7 + and v5.16b, v27.16b, v21.16b + adcs x3, x3, x8 + usra v3.2d, v27.2d, #32 + dup v29.2d, x6 + adcs x16, x11, xzr + mov x14, v20.d[0] + umlal v5.2d, v31.2s, v17.2s + mul x8, x9, x2 + mov x7, v20.d[1] + shl v19.2d, v22.2d, #33 + xtn v25.2s, v29.2d + rev64 v31.4s, v1.4s + lsl x13, x14, #32 + uzp2 v6.4s, v29.4s, v29.4s + umlal v19.2d, v7.2s, v7.2s + usra v3.2d, v5.2d, #32 + adds x1, x8, x8 + umulh x8, x4, x4 + add x12, x13, x14 + mul v17.4s, v31.4s, v29.4s + xtn v4.2s, v1.2d + adcs x14, x15, x15 + lsr x13, x12, #32 + adcs x15, x3, x3 + umull v31.2d, v25.2s, v28.2s + adcs x11, x16, x16 + umull v21.2d, v25.2s, v4.2s + mov x17, v3.d[0] + umull v18.2d, v6.2s, v28.2s + adc x16, x8, xzr + uaddlp v16.2d, v17.4s + movi v1.2d, #0xffffffff + subs x13, x13, x12 + usra v31.2d, v21.2d, #32 + sbc x8, x12, xzr + adds x17, x17, x1 + mul x1, x4, x4 + shl v28.2d, v16.2d, #32 + mov x3, v3.d[1] + adcs x14, x7, x14 + extr x7, x8, x13, #32 + adcs x13, x3, x15 + and v3.16b, v31.16b, v1.16b + adcs x11, x1, x11 + lsr x1, x8, #32 + umlal v3.2d, v6.2s, v4.2s + usra v18.2d, v31.2d, #32 + adc x3, x16, xzr + adds x1, x1, x12 + umlal v28.2d, v25.2s, v4.2s + adc x16, xzr, xzr + subs x15, x17, x7 + sbcs x7, x14, x1 + lsl x1, x15, #32 + sbcs x16, x13, x16 + add x8, x1, x15 + usra v18.2d, v3.2d, #32 + sbcs x14, x11, xzr + lsr x1, x8, #32 + sbcs x17, x3, xzr + sbc x11, x12, xzr + subs x13, x1, x8 + umulh x12, x4, x10 + sbc x1, x8, xzr + extr x13, x1, x13, #32 + lsr x1, x1, #32 + adds x15, x1, x8 + adc x1, xzr, xzr + subs x7, x7, x13 + sbcs x13, x16, x15 + lsl x3, x7, #32 + umulh x16, x2, 
x5 + sbcs x15, x14, x1 + add x7, x3, x7 + sbcs x3, x17, xzr + lsr x1, x7, #32 + sbcs x14, x11, xzr + sbc x11, x8, xzr + subs x8, x1, x7 + sbc x1, x7, xzr + extr x8, x1, x8, #32 + lsr x1, x1, #32 + adds x1, x1, x7 + adc x17, xzr, xzr + subs x13, x13, x8 + umulh x8, x9, x6 + sbcs x1, x15, x1 + sbcs x19, x3, x17 + sbcs x20, x14, xzr + mul x17, x2, x5 + sbcs x11, x11, xzr + stp x13, x1, [sp, #192] + sbc x14, x7, xzr + mul x7, x4, x10 + subs x1, x9, x2 + csetm x15, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + stp x11, x14, [sp, #224] + mul x14, x9, x6 + adds x17, x8, x17 + adcs x7, x16, x7 + adc x13, x12, xzr + subs x12, x5, x6 + cneg x3, x12, cc // cc = lo, ul, last + cinv x16, x15, cc // cc = lo, ul, last + mul x8, x1, x3 + umulh x1, x1, x3 + eor x12, x8, x16 + adds x11, x17, x14 + adcs x3, x7, x17 + adcs x15, x13, x7 + adc x8, x13, xzr + adds x3, x3, x14 + adcs x15, x15, x17 + adcs x17, x8, x7 + eor x1, x1, x16 + adc x13, x13, xzr + subs x9, x9, x4 + csetm x8, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x4, x2, x4 + cneg x4, x4, cc // cc = lo, ul, last + csetm x7, cc // cc = lo, ul, last + subs x2, x10, x6 + cinv x8, x8, cc // cc = lo, ul, last + cneg x2, x2, cc // cc = lo, ul, last + cmn x16, #0x1 + adcs x11, x11, x12 + mul x12, x9, x2 + adcs x3, x3, x1 + adcs x15, x15, x16 + umulh x9, x9, x2 + adcs x17, x17, x16 + adc x13, x13, x16 + subs x1, x10, x5 + cinv x2, x7, cc // cc = lo, ul, last + cneg x1, x1, cc // cc = lo, ul, last + eor x9, x9, x8 + cmn x8, #0x1 + eor x7, x12, x8 + mul x12, x4, x1 + adcs x3, x3, x7 + adcs x7, x15, x9 + adcs x15, x17, x8 + umulh x4, x4, x1 + adc x8, x13, x8 + cmn x2, #0x1 + eor x1, x12, x2 + adcs x1, x7, x1 + ldp x7, x16, [sp, #192] + eor x12, x4, x2 + adcs x4, x15, x12 + ldp x15, x12, [sp, #224] + adc x8, x8, x2 + adds x13, x14, x14 + umulh x14, x5, x10 + adcs x2, x11, x11 + adcs x3, x3, x3 + adcs x1, x1, x1 + adcs x4, x4, x4 + adcs x11, x8, x8 + adc x8, xzr, xzr + adds x13, x13, x7 + adcs x2, x2, x16 + mul x16, x5, x10 + adcs x3, x3, x19 + adcs x1, x1, x20 + umulh x5, x5, x5 + lsl x9, x13, #32 + add x9, x9, x13 + adcs x4, x4, x15 + mov x13, v28.d[1] + adcs x15, x11, x12 + lsr x7, x9, #32 + adc x11, x8, xzr + subs x7, x7, x9 + umulh x10, x10, x10 + sbc x17, x9, xzr + extr x7, x17, x7, #32 + lsr x17, x17, #32 + adds x17, x17, x9 + adc x12, xzr, xzr + subs x8, x2, x7 + sbcs x17, x3, x17 + lsl x7, x8, #32 + sbcs x2, x1, x12 + add x3, x7, x8 + sbcs x12, x4, xzr + lsr x1, x3, #32 + sbcs x7, x15, xzr + sbc x15, x9, xzr + subs x1, x1, x3 + sbc x4, x3, xzr + lsr x9, x4, #32 + extr x8, x4, x1, #32 + adds x9, x9, x3 + adc x4, xzr, xzr + subs x1, x17, x8 + lsl x17, x1, #32 + sbcs x8, x2, x9 + sbcs x9, x12, x4 + add x17, x17, x1 + mov x1, v18.d[1] + lsr x2, x17, #32 + sbcs x7, x7, xzr + mov x12, v18.d[0] + sbcs x15, x15, xzr + sbc x3, x3, xzr + subs x4, x2, x17 + sbc x2, x17, xzr + adds x12, x13, x12 + adcs x16, x16, x1 + lsr x13, x2, #32 + extr x1, x2, x4, #32 + adc x2, x14, xzr + adds x4, x13, x17 + mul x13, x6, x6 + adc x14, xzr, xzr + subs x1, x8, x1 + sbcs x4, x9, x4 + mov x9, v28.d[0] + sbcs x7, x7, x14 + sbcs x8, x15, xzr + sbcs x3, x3, xzr + sbc x14, x17, xzr + adds x17, x9, x9 + adcs x12, x12, x12 + mov x15, v19.d[0] + adcs x9, x16, x16 + umulh x6, x6, x6 + adcs x16, x2, x2 + adc x2, xzr, xzr + adds x11, x11, x8 + adcs x3, x3, xzr + adcs x14, x14, xzr + adcs x8, xzr, xzr + adds x13, x1, x13 + mov x1, v19.d[1] + adcs x6, x4, x6 + mov x4, #0xffffffff // #4294967295 + adcs x15, x7, x15 + adcs x7, x11, x5 + adcs x1, x3, 
x1 + adcs x14, x14, x10 + adc x11, x8, xzr + adds x6, x6, x17 + adcs x8, x15, x12 + adcs x3, x7, x9 + adcs x15, x1, x16 + mov x16, #0xffffffff00000001 // #-4294967295 + adcs x14, x14, x2 + mov x2, #0x1 // #1 + adc x17, x11, xzr + cmn x13, x16 + adcs xzr, x6, x4 + adcs xzr, x8, x2 + adcs xzr, x3, xzr + adcs xzr, x15, xzr + adcs xzr, x14, xzr + adc x1, x17, xzr + neg x9, x1 + and x1, x16, x9 + adds x11, x13, x1 + and x13, x4, x9 + adcs x5, x6, x13 + and x1, x2, x9 + adcs x7, x8, x1 + stp x11, x5, [sp, #192] + adcs x11, x3, xzr + adcs x2, x15, xzr + stp x7, x11, [sp, #208] + adc x17, x14, xzr + stp x2, x17, [sp, #224] + ldp x5, x6, [sp, #240] + ldp x4, x3, [sp, #48] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #256] + ldp x4, x3, [sp, #64] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [sp, #272] + ldp x4, x3, [sp, #80] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc // cc = lo, ul, last + mov x4, #0xffffffff // #4294967295 + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe // #-2 + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [x25, #96] + stp x7, x8, [x25, #112] + stp x9, x10, [x25, #128] + ldr q3, [sp, #288] + ldr q25, [sp, #96] + ldp x13, x23, [sp, #96] + ldp x3, x21, [sp, #288] + rev64 v23.4s, v25.4s + uzp1 v17.4s, v25.4s, v3.4s + umulh x15, x3, x13 + mul v6.4s, v23.4s, v3.4s + uzp1 v3.4s, v3.4s, v3.4s + ldr q27, [sp, #128] + ldp x8, x24, [sp, #304] + subs x6, x3, x21 + ldr q0, [sp, #320] + movi v23.2d, #0xffffffff + csetm x10, cc // cc = lo, ul, last + umulh x19, x21, x23 + rev64 v4.4s, v27.4s + uzp2 v25.4s, v27.4s, v27.4s + cneg x4, x6, cc // cc = lo, ul, last + subs x7, x23, x13 + xtn v22.2s, v0.2d + xtn v24.2s, v27.2d + cneg x20, x7, cc // cc = lo, ul, last + ldp x6, x14, [sp, #112] + mul v27.4s, v4.4s, v0.4s + uaddlp v20.2d, v6.4s + cinv x5, x10, cc // cc = lo, ul, last + mul x16, x4, x20 + uzp2 v6.4s, v0.4s, v0.4s + umull v21.2d, v22.2s, v25.2s + shl v0.2d, v20.2d, #32 + umlal v0.2d, v3.2s, v17.2s + mul x22, x8, x6 + umull v1.2d, v6.2s, v25.2s + subs x12, x3, x8 + umull v20.2d, v22.2s, v24.2s + cneg x17, x12, cc // cc = lo, ul, last + umulh x9, x8, x6 + mov x12, v0.d[1] + eor x11, x16, x5 + mov x7, v0.d[0] + csetm x10, cc // cc = lo, ul, last + usra v21.2d, v20.2d, #32 + adds x15, x15, x12 + adcs x12, x19, x22 + umulh x20, x4, x20 + adc x19, x9, xzr + usra v1.2d, v21.2d, #32 + adds x22, x15, x7 + and v26.16b, v21.16b, v23.16b + adcs x16, x12, x15 + uaddlp v25.2d, v27.4s + adcs x9, x19, x12 + umlal v26.2d, v6.2s, v24.2s + adc x4, x19, xzr + adds x16, x16, x7 + shl v27.2d, v25.2d, #32 + adcs x9, x9, x15 + adcs x4, x4, x12 + eor x12, x20, x5 + adc x15, x19, xzr + subs x20, x6, x13 + cneg x20, x20, cc // cc = lo, ul, last + cinv x10, x10, cc // cc = lo, ul, last + cmn x5, #0x1 + mul x19, x17, x20 + adcs x11, x22, x11 + adcs x12, x16, x12 + adcs x9, x9, x5 + umulh x17, x17, x20 + adcs x22, x4, x5 + adc x5, x15, x5 + subs x16, x21, x8 + cneg x20, x16, cc // cc = lo, ul, last + eor x19, x19, x10 + csetm x4, cc // cc = lo, ul, last + subs x16, x6, x23 + cneg x16, x16, cc // cc = lo, ul, last + umlal v27.2d, v22.2s, v24.2s + mul x15, x20, x16 + cinv x4, x4, cc // cc = lo, ul, last + cmn x10, #0x1 + usra v1.2d, v26.2d, #32 + adcs x19, x12, x19 + eor x17, x17, x10 + adcs x9, x9, x17 + adcs x22, x22, x10 + lsl x12, x7, #32 + umulh x20, x20, x16 + eor x16, x15, x4 + ldp x15, x17, [sp, #128] + add x2, x12, x7 + adc x7, x5, x10 + ldp x5, x10, [sp, #320] + lsr x1, x2, #32 + 
eor x12, x20, x4 + subs x1, x1, x2 + sbc x20, x2, xzr + cmn x4, #0x1 + adcs x9, x9, x16 + extr x1, x20, x1, #32 + lsr x20, x20, #32 + adcs x22, x22, x12 + adc x16, x7, x4 + adds x12, x20, x2 + umulh x7, x24, x14 + adc x4, xzr, xzr + subs x1, x11, x1 + sbcs x20, x19, x12 + sbcs x12, x9, x4 + lsl x9, x1, #32 + add x1, x9, x1 + sbcs x9, x22, xzr + mul x22, x24, x14 + sbcs x16, x16, xzr + lsr x4, x1, #32 + sbc x19, x2, xzr + subs x4, x4, x1 + sbc x11, x1, xzr + extr x2, x11, x4, #32 + lsr x4, x11, #32 + adds x4, x4, x1 + adc x11, xzr, xzr + subs x2, x20, x2 + sbcs x4, x12, x4 + sbcs x20, x9, x11 + lsl x12, x2, #32 + add x2, x12, x2 + sbcs x9, x16, xzr + lsr x11, x2, #32 + sbcs x19, x19, xzr + sbc x1, x1, xzr + subs x16, x11, x2 + sbc x12, x2, xzr + extr x16, x12, x16, #32 + lsr x12, x12, #32 + adds x11, x12, x2 + adc x12, xzr, xzr + subs x26, x4, x16 + mov x4, v27.d[0] + sbcs x27, x20, x11 + sbcs x20, x9, x12 + sbcs x11, x19, xzr + sbcs x9, x1, xzr + stp x20, x11, [sp, #256] + mov x1, v1.d[0] + sbc x20, x2, xzr + subs x12, x24, x5 + mov x11, v27.d[1] + cneg x16, x12, cc // cc = lo, ul, last + csetm x2, cc // cc = lo, ul, last + subs x19, x15, x14 + mov x12, v1.d[1] + cinv x2, x2, cc // cc = lo, ul, last + cneg x19, x19, cc // cc = lo, ul, last + stp x9, x20, [sp, #272] + mul x9, x16, x19 + adds x4, x7, x4 + adcs x11, x1, x11 + adc x1, x12, xzr + adds x20, x4, x22 + umulh x19, x16, x19 + adcs x7, x11, x4 + eor x16, x9, x2 + adcs x9, x1, x11 + adc x12, x1, xzr + adds x7, x7, x22 + adcs x4, x9, x4 + adcs x9, x12, x11 + adc x12, x1, xzr + cmn x2, #0x1 + eor x1, x19, x2 + adcs x11, x20, x16 + adcs x19, x7, x1 + adcs x1, x4, x2 + adcs x20, x9, x2 + adc x2, x12, x2 + subs x12, x24, x10 + cneg x16, x12, cc // cc = lo, ul, last + csetm x12, cc // cc = lo, ul, last + subs x9, x17, x14 + cinv x12, x12, cc // cc = lo, ul, last + cneg x9, x9, cc // cc = lo, ul, last + subs x3, x24, x3 + sbcs x21, x5, x21 + mul x24, x16, x9 + sbcs x4, x10, x8 + ngc x8, xzr + subs x10, x5, x10 + eor x5, x24, x12 + csetm x7, cc // cc = lo, ul, last + cneg x24, x10, cc // cc = lo, ul, last + subs x10, x17, x15 + cinv x7, x7, cc // cc = lo, ul, last + cneg x10, x10, cc // cc = lo, ul, last + subs x14, x13, x14 + sbcs x15, x23, x15 + eor x13, x21, x8 + mul x23, x24, x10 + sbcs x17, x6, x17 + eor x6, x3, x8 + ngc x21, xzr + umulh x9, x16, x9 + cmn x8, #0x1 + eor x3, x23, x7 + adcs x23, x6, xzr + adcs x13, x13, xzr + eor x16, x4, x8 + adc x16, x16, xzr + eor x4, x17, x21 + umulh x17, x24, x10 + cmn x21, #0x1 + eor x24, x14, x21 + eor x6, x15, x21 + adcs x15, x24, xzr + adcs x14, x6, xzr + adc x6, x4, xzr + cmn x12, #0x1 + eor x4, x9, x12 + adcs x19, x19, x5 + umulh x5, x23, x15 + adcs x1, x1, x4 + adcs x10, x20, x12 + eor x4, x17, x7 + adc x2, x2, x12 + cmn x7, #0x1 + adcs x12, x1, x3 + ldp x17, x24, [sp, #256] + mul x1, x16, x6 + adcs x3, x10, x4 + adc x2, x2, x7 + ldp x7, x4, [sp, #272] + adds x20, x22, x26 + mul x10, x13, x14 + adcs x11, x11, x27 + eor x9, x8, x21 + adcs x26, x19, x17 + stp x20, x11, [sp, #240] + adcs x27, x12, x24 + mul x8, x23, x15 + adcs x3, x3, x7 + adcs x12, x2, x4 + adc x19, xzr, xzr + subs x21, x23, x16 + umulh x2, x16, x6 + stp x3, x12, [sp, #272] + cneg x3, x21, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + umulh x11, x13, x14 + subs x21, x13, x16 + eor x7, x8, x9 + cneg x17, x21, cc // cc = lo, ul, last + csetm x16, cc // cc = lo, ul, last + subs x21, x6, x15 + cneg x22, x21, cc // cc = lo, ul, last + cinv x21, x24, cc // cc = lo, ul, last + subs x20, x23, x13 + umulh x12, x3, x22 + 
cneg x23, x20, cc // cc = lo, ul, last + csetm x24, cc // cc = lo, ul, last + subs x20, x14, x15 + cinv x24, x24, cc // cc = lo, ul, last + mul x22, x3, x22 + cneg x3, x20, cc // cc = lo, ul, last + subs x13, x6, x14 + cneg x20, x13, cc // cc = lo, ul, last + cinv x15, x16, cc // cc = lo, ul, last + adds x13, x5, x10 + mul x4, x23, x3 + adcs x11, x11, x1 + adc x14, x2, xzr + adds x5, x13, x8 + adcs x16, x11, x13 + umulh x23, x23, x3 + adcs x3, x14, x11 + adc x1, x14, xzr + adds x10, x16, x8 + adcs x6, x3, x13 + adcs x8, x1, x11 + umulh x13, x17, x20 + eor x1, x4, x24 + adc x4, x14, xzr + cmn x24, #0x1 + adcs x1, x5, x1 + eor x16, x23, x24 + eor x11, x1, x9 + adcs x23, x10, x16 + eor x2, x22, x21 + adcs x3, x6, x24 + mul x14, x17, x20 + eor x17, x13, x15 + adcs x13, x8, x24 + adc x8, x4, x24 + cmn x21, #0x1 + adcs x6, x23, x2 + mov x16, #0xfffffffffffffffe // #-2 + eor x20, x12, x21 + adcs x20, x3, x20 + eor x23, x14, x15 + adcs x2, x13, x21 + adc x8, x8, x21 + cmn x15, #0x1 + ldp x5, x4, [sp, #240] + adcs x22, x20, x23 + eor x23, x22, x9 + adcs x17, x2, x17 + adc x22, x8, x15 + cmn x9, #0x1 + adcs x15, x7, x5 + ldp x10, x14, [sp, #272] + eor x1, x6, x9 + lsl x2, x15, #32 + adcs x8, x11, x4 + adcs x13, x1, x26 + eor x1, x22, x9 + adcs x24, x23, x27 + eor x11, x17, x9 + adcs x23, x11, x10 + adcs x7, x1, x14 + adcs x17, x9, x19 + adcs x20, x9, xzr + add x1, x2, x15 + lsr x3, x1, #32 + adcs x11, x9, xzr + adc x9, x9, xzr + subs x3, x3, x1 + sbc x6, x1, xzr + adds x24, x24, x5 + adcs x4, x23, x4 + extr x3, x6, x3, #32 + lsr x6, x6, #32 + adcs x21, x7, x26 + adcs x15, x17, x27 + adcs x7, x20, x10 + adcs x20, x11, x14 + mov x14, #0xffffffff // #4294967295 + adc x22, x9, x19 + adds x12, x6, x1 + adc x10, xzr, xzr + subs x3, x8, x3 + sbcs x12, x13, x12 + lsl x9, x3, #32 + add x3, x9, x3 + sbcs x10, x24, x10 + sbcs x24, x4, xzr + lsr x9, x3, #32 + sbcs x21, x21, xzr + sbc x1, x1, xzr + subs x9, x9, x3 + sbc x13, x3, xzr + extr x9, x13, x9, #32 + lsr x13, x13, #32 + adds x13, x13, x3 + adc x6, xzr, xzr + subs x12, x12, x9 + sbcs x17, x10, x13 + lsl x2, x12, #32 + sbcs x10, x24, x6 + add x9, x2, x12 + sbcs x6, x21, xzr + lsr x5, x9, #32 + sbcs x21, x1, xzr + sbc x13, x3, xzr + subs x8, x5, x9 + sbc x19, x9, xzr + lsr x12, x19, #32 + extr x3, x19, x8, #32 + adds x8, x12, x9 + adc x1, xzr, xzr + subs x2, x17, x3 + sbcs x12, x10, x8 + sbcs x5, x6, x1 + sbcs x3, x21, xzr + sbcs x19, x13, xzr + sbc x24, x9, xzr + adds x23, x15, x3 + adcs x8, x7, x19 + adcs x11, x20, x24 + adc x9, x22, xzr + add x24, x9, #0x1 + lsl x7, x24, #32 + subs x21, x24, x7 + sbc x10, x7, xzr + adds x6, x2, x21 + adcs x7, x12, x10 + adcs x24, x5, x24 + adcs x13, x23, xzr + adcs x8, x8, xzr + adcs x15, x11, xzr + csetm x23, cc // cc = lo, ul, last + and x11, x16, x23 + and x20, x14, x23 + adds x22, x6, x20 + eor x3, x20, x23 + adcs x5, x7, x3 + adcs x14, x24, x11 + stp x22, x5, [sp, #240] + adcs x5, x13, x23 + adcs x12, x8, x23 + stp x14, x5, [sp, #256] + adc x19, x15, x23 + ldp x1, x2, [sp, #144] + ldp x3, x4, [sp, #160] + ldp x5, x6, [sp, #176] + lsl x0, x1, #2 + ldp x7, x8, [sp, #288] + subs x0, x0, x7 + extr x1, x2, x1, #62 + sbcs x1, x1, x8 + ldp x7, x8, [sp, #304] + extr x2, x3, x2, #62 + sbcs x2, x2, x7 + extr x3, x4, x3, #62 + sbcs x3, x3, x8 + extr x4, x5, x4, #62 + ldp x7, x8, [sp, #320] + sbcs x4, x4, x7 + extr x5, x6, x5, #62 + sbcs x5, x5, x8 + lsr x6, x6, #62 + adc x6, x6, xzr + lsl x7, x6, #32 + subs x8, x6, x7 + sbc x7, x7, xzr + adds x0, x0, x8 + adcs x1, x1, x7 + adcs x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr 
+ adcs x5, x5, xzr + csetm x8, cc // cc = lo, ul, last + mov x9, #0xffffffff // #4294967295 + and x9, x9, x8 + adds x0, x0, x9 + eor x9, x9, x8 + adcs x1, x1, x9 + mov x9, #0xfffffffffffffffe // #-2 + and x9, x9, x8 + adcs x2, x2, x9 + adcs x3, x3, x8 + adcs x4, x4, x8 + adc x5, x5, x8 + stp x0, x1, [x25] + stp x2, x3, [x25, #16] + stp x4, x5, [x25, #32] + ldp x0, x1, [sp, #192] + mov x6, #0xffffffff // #4294967295 + subs x6, x6, x0 + mov x7, #0xffffffff00000000 // #-4294967296 + sbcs x7, x7, x1 + ldp x0, x1, [sp, #208] + mov x8, #0xfffffffffffffffe // #-2 + sbcs x8, x8, x0 + mov x13, #0xffffffffffffffff // #-1 + sbcs x9, x13, x1 + ldp x0, x1, [sp, #224] + sbcs x10, x13, x0 + sbc x11, x13, x1 + lsl x0, x6, #3 + extr x1, x7, x6, #61 + extr x2, x8, x7, #61 + extr x3, x9, x8, #61 + extr x4, x10, x9, #61 + extr x5, x11, x10, #61 + lsr x6, x11, #61 + add x6, x6, #0x1 + ldp x8, x9, [sp, #240] + ldp x10, x11, [sp, #256] + mov x14, #0x3 // #3 + mul x15, x14, x8 + umulh x8, x14, x8 + adds x0, x0, x15 + mul x15, x14, x9 + umulh x9, x14, x9 + adcs x1, x1, x15 + mul x15, x14, x10 + umulh x10, x14, x10 + adcs x2, x2, x15 + mul x15, x14, x11 + umulh x11, x14, x11 + adcs x3, x3, x15 + mul x15, x14, x12 + umulh x12, x14, x12 + adcs x4, x4, x15 + mul x15, x14, x19 + umulh x13, x14, x19 + adcs x5, x5, x15 + adc x6, x6, xzr + adds x1, x1, x8 + adcs x2, x2, x9 + adcs x3, x3, x10 + adcs x4, x4, x11 + adcs x5, x5, x12 + adcs x6, x6, x13 + lsl x7, x6, #32 + subs x8, x6, x7 + sbc x7, x7, xzr + adds x0, x0, x8 + adcs x1, x1, x7 + adcs x2, x2, x6 + adcs x3, x3, xzr + adcs x4, x4, xzr + adcs x5, x5, xzr + csetm x6, cc // cc = lo, ul, last + mov x7, #0xffffffff // #4294967295 + and x7, x7, x6 + adds x0, x0, x7 + eor x7, x7, x6 + adcs x1, x1, x7 + mov x7, #0xfffffffffffffffe // #-2 + and x7, x7, x6 + adcs x2, x2, x7 + adcs x3, x3, x6 + adcs x4, x4, x6 + adc x5, x5, x6 + stp x0, x1, [x25, #48] + stp x2, x3, [x25, #64] + stp x4, x5, [x25, #80] + ldp x19, x20, [sp, #336] + ldp x21, x22, [sp, #352] + ldp x23, x24, [sp, #368] + ldp x25, x26, [sp, #384] + ldp x27, xzr, [sp, #400] + add sp, sp, #0x1a0 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p384/p384_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul_alt.S similarity index 99% rename from third_party/s2n-bignum/arm/p384/p384_montjscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul_alt.S index 9f47090a8c0..5dfba9c862c 100644 --- a/third_party/s2n-bignum/arm/p384/p384_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/p384_montjscalarmul_alt.S @@ -61,42 +61,42 @@ // which doesn't accept repetitions, assembler macros etc. 
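Note on the macro hunks that follow here and in the other renamed p384/p521 assembly files further down: they are a mechanical rewrite in which the ';' separators inside multi-statement assembler macros are replaced by an '__LF' token. The working assumption in this note is that '__LF' is defined elsewhere (presumably in the _internal_s2n_bignum.h header these files already include, whose definition is not part of this diff) so that a later preprocessing pass can turn it back into real line breaks, keeping the macros usable with toolchains that do not accept ';'-separated statements. A toy, purely illustrative sketch of that expansion:

# Toy illustration only; the real definition and expansion of __LF are not shown in this diff.
macro_body = "cmp bf, #(1*1) __LF ldp x20, x21, [x19] __LF csel x0, x20, x0, eq"
print(macro_body.replace(" __LF ", "\n"))   # one assembler statement per line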
#define selectblock(I) \ - cmp bf, #(1*I); \ - ldp x20, x21, [x19]; \ - csel x0, x20, x0, eq; \ - csel x1, x21, x1, eq; \ - ldp x20, x21, [x19, #16]; \ - csel x2, x20, x2, eq; \ - csel x3, x21, x3, eq; \ - ldp x20, x21, [x19, #32]; \ - csel x4, x20, x4, eq; \ - csel x5, x21, x5, eq; \ - ldp x20, x21, [x19, #48]; \ - csel x6, x20, x6, eq; \ - csel x7, x21, x7, eq; \ - ldp x20, x21, [x19, #64]; \ - csel x8, x20, x8, eq; \ - csel x9, x21, x9, eq; \ - ldp x20, x21, [x19, #80]; \ - csel x10, x20, x10, eq; \ - csel x11, x21, x11, eq; \ - ldp x20, x21, [x19, #96]; \ - csel x12, x20, x12, eq; \ - csel x13, x21, x13, eq; \ - ldp x20, x21, [x19, #112]; \ - csel x14, x20, x14, eq; \ - csel x15, x21, x15, eq; \ - ldp x20, x21, [x19, #128]; \ - csel x16, x20, x16, eq; \ - csel x17, x21, x17, eq; \ + cmp bf, #(1*I) __LF \ + ldp x20, x21, [x19] __LF \ + csel x0, x20, x0, eq __LF \ + csel x1, x21, x1, eq __LF \ + ldp x20, x21, [x19, #16] __LF \ + csel x2, x20, x2, eq __LF \ + csel x3, x21, x3, eq __LF \ + ldp x20, x21, [x19, #32] __LF \ + csel x4, x20, x4, eq __LF \ + csel x5, x21, x5, eq __LF \ + ldp x20, x21, [x19, #48] __LF \ + csel x6, x20, x6, eq __LF \ + csel x7, x21, x7, eq __LF \ + ldp x20, x21, [x19, #64] __LF \ + csel x8, x20, x8, eq __LF \ + csel x9, x21, x9, eq __LF \ + ldp x20, x21, [x19, #80] __LF \ + csel x10, x20, x10, eq __LF \ + csel x11, x21, x11, eq __LF \ + ldp x20, x21, [x19, #96] __LF \ + csel x12, x20, x12, eq __LF \ + csel x13, x21, x13, eq __LF \ + ldp x20, x21, [x19, #112] __LF \ + csel x14, x20, x14, eq __LF \ + csel x15, x21, x15, eq __LF \ + ldp x20, x21, [x19, #128] __LF \ + csel x16, x20, x16, eq __LF \ + csel x17, x21, x17, eq __LF \ add x19, x19, #JACSIZE // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(p384_montjscalarmul_alt): diff --git a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montmul_p384_base.S similarity index 87% rename from third_party/s2n-bignum/arm/p384/bignum_montmul_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montmul_p384_base.S index 05c3d1786a8..cda6a1571b0 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montmul_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montmul_p384_base.S @@ -5,7 +5,7 @@ // Montgomery multiply, z := (x * y / 2^384) mod p_384 // Inputs x[6], y[6]; output z[6] // -// extern void bignum_montmul_p384 +// extern void bignum_montmul_p384_base // (uint64_t z[static 6], uint64_t x[static 6], uint64_t y[static 6]); // // Does z := (2^{-384} * x * y) mod p_384, assuming that the inputs x and y @@ -16,8 +16,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p384_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p384_base) .text .balign 4 @@ -28,15 +28,15 @@ // --------------------------------------------------------------------------- #define muldiffn(c,h,l, t, x,y, w,z) \ - subs t, x, y; \ - cneg t, t, cc; \ - csetm c, cc; \ - subs h, w, z; \ - cneg h, h, cc; \ - mul l, t, h; \ - umulh h, t, h; \ - cinv c, c, cc; \ - eor l, l, c; \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + 
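In the selectblock(I) hunk just above, each invocation compares the digit bf (presumably derived from the scalar) against the constant I, conditionally keeps one 18-word table row in x0..x17 via csel, and advances x19 by JACSIZE to the next row; the apparent point of the cmp/csel pattern is that the table walk touches every row, so the lookup does not leak the index through branches or addresses. A functional model follows, with the caveat that Python cannot express that constant-time property:

# Functional sketch only (assumptions: bf is a 1-based index into the table and
# the accumulator registers start out zeroed or holding a prior default value).
def select_row(table, bf):
    acc = [0] * 18                                            # models x0..x17
    for i, row in enumerate(table, start=1):
        acc = [r if bf == i else a for a, r in zip(acc, row)] # csel ..., eq
    return acc

table = [[100 * i + j for j in range(18)] for i in range(1, 17)]
assert select_row(table, 5) == table[4]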
csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ eor h, h, c // --------------------------------------------------------------------------- @@ -52,27 +52,27 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Recycle d0 (which we know gets implicitly cancelled) to store it */ \ - lsl t1, d0, #32; \ - add d0, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d0, t1, d0 __LF \ /* Now let [t2;t1] = 2^64 * w - w + w_hi where w_hi = floor(w/2^32) */ \ /* We need to subtract 2^32 * this, and we can ignore its lower 32 */ \ /* bits since by design it will cancel anyway; we only need the w_hi */ \ /* part to get the carry propagation going. */ \ - lsr t1, d0, #32; \ - subs t1, t1, d0; \ - sbc t2, d0, xzr; \ + lsr t1, d0, #32 __LF \ + subs t1, t1, d0 __LF \ + sbc t2, d0, xzr __LF \ /* Now select in t1 the field to subtract from d1 */ \ - extr t1, t2, t1, #32; \ + extr t1, t2, t1, #32 __LF \ /* And now get the terms to subtract from d2 and d3 */ \ - lsr t2, t2, #32; \ - adds t2, t2, d0; \ - adc t3, xzr, xzr; \ + lsr t2, t2, #32 __LF \ + adds t2, t2, d0 __LF \ + adc t3, xzr, xzr __LF \ /* Do the subtraction of that portion */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ /* Now effectively add 2^384 * w by taking d0 as the input for last sbc */ \ sbc d6, d0, xzr @@ -102,7 +102,7 @@ #define t3 x23 #define t4 x24 -S2N_BN_SYMBOL(bignum_montmul_p384): +S2N_BN_SYMBOL(bignum_montmul_p384_base): // Save some registers diff --git a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montsqr_p384_base.S similarity index 84% rename from third_party/s2n-bignum/arm/p384/bignum_montsqr_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montsqr_p384_base.S index fd55c1bf029..410ae8f4a89 100644 --- a/third_party/s2n-bignum/arm/p384/bignum_montsqr_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/bignum_montsqr_p384_base.S @@ -5,7 +5,7 @@ // Montgomery square, z := (x^2 / 2^384) mod p_384 // Input x[6]; output z[6] // -// extern void bignum_montsqr_p384 +// extern void bignum_montsqr_p384_base // (uint64_t z[static 6], uint64_t x[static 6]); // // Does z := (x^2 / 2^384) mod p_384, assuming x^2 <= 2^384 * p_384, which is @@ -15,8 +15,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p384_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p384_base) .text .balign 4 @@ -27,15 +27,15 @@ // --------------------------------------------------------------------------- #define muldiffn(c,h,l, t, x,y, w,z) \ - subs t, x, y; \ - cneg t, t, cc; \ - csetm c, cc; \ - subs h, w, z; \ - cneg h, h, cc; \ - mul l, t, h; \ - umulh h, t, h; \ - cinv c, c, cc; \ - eor l, l, c; \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ eor h, h, c // 
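The montreds macro above (and its identical copy in bignum_montsqr_p384_base.S just below) reduces one 64-bit word per application. The comment's claim that w = [d0 + (d0 << 32)] mod 2^64 is the right correction multiplier follows from the shape of p_384: its lowest limb is 2^32 - 1 and -p_384^{-1} mod 2^64 = 2^32 + 1, so adding w * p_384 clears the bottom word, which is exactly what one step of Montgomery reduction needs. A quick numeric check of that reasoning (an illustration, not code from this patch):

p_384 = 2**384 - 2**128 - 2**96 + 2**32 - 1
assert (-pow(p_384, -1, 2**64)) % 2**64 == 2**32 + 1         # needs Python 3.8+
for d0 in (1, 0xdeadbeefcafef00d, 2**64 - 1):
    w = (d0 + (d0 << 32)) % 2**64                            # the macro's lsl/add pair
    assert (d0 + w * p_384) % 2**64 == 0                     # bottom word cancels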
--------------------------------------------------------------------------- @@ -51,27 +51,27 @@ #define montreds(d6,d5,d4,d3,d2,d1,d0, t3,t2,t1) \ /* Our correction multiplier is w = [d0 + (d0<<32)] mod 2^64 */ \ /* Recycle d0 (which we know gets implicitly cancelled) to store it */ \ - lsl t1, d0, #32; \ - add d0, t1, d0; \ + lsl t1, d0, #32 __LF \ + add d0, t1, d0 __LF \ /* Now let [t2;t1] = 2^64 * w - w + w_hi where w_hi = floor(w/2^32) */ \ /* We need to subtract 2^32 * this, and we can ignore its lower 32 */ \ /* bits since by design it will cancel anyway; we only need the w_hi */ \ /* part to get the carry propagation going. */ \ - lsr t1, d0, #32; \ - subs t1, t1, d0; \ - sbc t2, d0, xzr; \ + lsr t1, d0, #32 __LF \ + subs t1, t1, d0 __LF \ + sbc t2, d0, xzr __LF \ /* Now select in t1 the field to subtract from d1 */ \ - extr t1, t2, t1, #32; \ + extr t1, t2, t1, #32 __LF \ /* And now get the terms to subtract from d2 and d3 */ \ - lsr t2, t2, #32; \ - adds t2, t2, d0; \ - adc t3, xzr, xzr; \ + lsr t2, t2, #32 __LF \ + adds t2, t2, d0 __LF \ + adc t3, xzr, xzr __LF \ /* Do the subtraction of that portion */ \ - subs d1, d1, t1; \ - sbcs d2, d2, t2; \ - sbcs d3, d3, t3; \ - sbcs d4, d4, xzr; \ - sbcs d5, d5, xzr; \ + subs d1, d1, t1 __LF \ + sbcs d2, d2, t2 __LF \ + sbcs d3, d3, t3 __LF \ + sbcs d4, d4, xzr __LF \ + sbcs d5, d5, xzr __LF \ /* Now effectively add 2^384 * w by taking d0 as the input for last sbc */ \ sbc d6, d0, xzr @@ -93,7 +93,7 @@ #define d3 x16 #define d4 x17 -S2N_BN_SYMBOL(bignum_montsqr_p384): +S2N_BN_SYMBOL(bignum_montsqr_p384_base): // Load in all words of the input diff --git a/third_party/s2n-bignum/arm/p384/unopt/p384_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjadd.S similarity index 96% rename from third_party/s2n-bignum/arm/p384/unopt/p384_montjadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjadd.S index cbd6f3cf003..7c7e1545fe1 100644 --- a/third_party/s2n-bignum/arm/p384/unopt/p384_montjadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjadd.S @@ -73,7 +73,7 @@ #define NSPACE (NUMSIZE*7) -// Corresponds to bignum_montmul_p384_neon, with callee-save register spills +// Corresponds to bignum_montmul_p384, with callee-save register spills // rewritten to update sp in advance .montmul_p384: @@ -807,33 +807,33 @@ // Corresponds exactly to bignum_sub_p384 .sub_p384: - ldp x5, x6, [x1] - ldp x4, x3, [x2] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [x1, #16] - ldp x4, x3, [x2, #16] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [x1, #32] - ldp x4, x3, [x2, #32] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc - mov x4, #0xffffffff - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [x0] - stp x7, x8, [x0, #16] - stp x9, x10, [x0, #32] + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc + mov x4, #0xffffffff + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] ret diff --git 
a/third_party/s2n-bignum/arm/p384/unopt/p384_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjdouble.S similarity index 62% rename from third_party/s2n-bignum/arm/p384/unopt/p384_montjdouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjdouble.S index 4cdeeb86997..214fd2a6d91 100644 --- a/third_party/s2n-bignum/arm/p384/unopt/p384_montjdouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p384/unopt/p384_montjdouble.S @@ -58,7 +58,7 @@ #define NSPACE #(NUMSIZE*7) -// Corresponds exactly to bignum_montmul_p384_neon +// Corresponds exactly to bignum_montmul_p384 .montmul_p384: sub sp, sp, 48 @@ -791,76 +791,76 @@ // Corresponds exactly to bignum_sub_p384 .sub_p384: - ldp x5, x6, [x1] - ldp x4, x3, [x2] - subs x5, x5, x4 - sbcs x6, x6, x3 - ldp x7, x8, [x1, #16] - ldp x4, x3, [x2, #16] - sbcs x7, x7, x4 - sbcs x8, x8, x3 - ldp x9, x10, [x1, #32] - ldp x4, x3, [x2, #32] - sbcs x9, x9, x4 - sbcs x10, x10, x3 - csetm x3, cc - mov x4, #0xffffffff - and x4, x4, x3 - adds x5, x5, x4 - eor x4, x4, x3 - adcs x6, x6, x4 - mov x4, #0xfffffffffffffffe - and x4, x4, x3 - adcs x7, x7, x4 - adcs x8, x8, x3 - adcs x9, x9, x3 - adc x10, x10, x3 - stp x5, x6, [x0] - stp x7, x8, [x0, #16] - stp x9, x10, [x0, #32] + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + csetm x3, cc + mov x4, #0xffffffff + and x4, x4, x3 + adds x5, x5, x4 + eor x4, x4, x3 + adcs x6, x6, x4 + mov x4, #0xfffffffffffffffe + and x4, x4, x3 + adcs x7, x7, x4 + adcs x8, x8, x3 + adcs x9, x9, x3 + adc x10, x10, x3 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] ret // Corresponds exactly to bignum_add_p384 .add_p384: - ldp x5, x6, [x1] - ldp x4, x3, [x2] - adds x5, x5, x4 - adcs x6, x6, x3 - ldp x7, x8, [x1, #16] - ldp x4, x3, [x2, #16] - adcs x7, x7, x4 - adcs x8, x8, x3 - ldp x9, x10, [x1, #32] - ldp x4, x3, [x2, #32] - adcs x9, x9, x4 - adcs x10, x10, x3 - adc x3, xzr, xzr - mov x4, #0xffffffff - cmp x5, x4 - mov x4, #0xffffffff00000000 - sbcs xzr, x6, x4 - mov x4, #0xfffffffffffffffe - sbcs xzr, x7, x4 - adcs xzr, x8, xzr - adcs xzr, x9, xzr - adcs xzr, x10, xzr - adcs x3, x3, xzr - csetm x3, ne - mov x4, #0xffffffff - and x4, x4, x3 - subs x5, x5, x4 - eor x4, x4, x3 - sbcs x6, x6, x4 - mov x4, #0xfffffffffffffffe - and x4, x4, x3 - sbcs x7, x7, x4 - sbcs x8, x8, x3 - sbcs x9, x9, x3 - sbc x10, x10, x3 - stp x5, x6, [x0] - stp x7, x8, [x0, #16] - stp x9, x10, [x0, #32] + ldp x5, x6, [x1] + ldp x4, x3, [x2] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + adcs x7, x7, x4 + adcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + adcs x9, x9, x4 + adcs x10, x10, x3 + adc x3, xzr, xzr + mov x4, #0xffffffff + cmp x5, x4 + mov x4, #0xffffffff00000000 + sbcs xzr, x6, x4 + mov x4, #0xfffffffffffffffe + sbcs xzr, x7, x4 + adcs xzr, x8, xzr + adcs xzr, x9, xzr + adcs xzr, x10, xzr + adcs x3, x3, xzr + csetm x3, ne + mov x4, #0xffffffff + and x4, x4, x3 + subs x5, x5, x4 + eor x4, x4, x3 + sbcs x6, x6, x4 + mov x4, #0xfffffffffffffffe + and x4, x4, x3 + sbcs x7, x7, x4 + sbcs x8, x8, x3 + sbcs x9, x9, x3 + sbc x10, x10, x3 + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] ret @@ -891,248 +891,248 @@ // P0 = 4 * P1 - P2 #define cmsub41_p384(P0,P1,P2) \ - ldp x1, x2, [P1]; \ - ldp x3, x4, 
[P1+16]; \ - ldp x5, x6, [P1+32]; \ - lsl x0, x1, #2; \ - ldp x7, x8, [P2]; \ - subs x0, x0, x7; \ - extr x1, x2, x1, #62; \ - sbcs x1, x1, x8; \ - ldp x7, x8, [P2+16]; \ - extr x2, x3, x2, #62; \ - sbcs x2, x2, x7; \ - extr x3, x4, x3, #62; \ - sbcs x3, x3, x8; \ - extr x4, x5, x4, #62; \ - ldp x7, x8, [P2+32]; \ - sbcs x4, x4, x7; \ - extr x5, x6, x5, #62; \ - sbcs x5, x5, x8; \ - lsr x6, x6, #62; \ - adc x6, x6, xzr; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x8, cc; \ - mov x9, #0xffffffff; \ - and x9, x9, x8; \ - adds x0, x0, x9; \ - eor x9, x9, x8; \ - adcs x1, x1, x9; \ - mov x9, #0xfffffffffffffffe; \ - and x9, x9, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x8; \ - adcs x4, x4, x8; \ - adc x5, x5, x8; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ + ldp x1, x2, [P1] __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P1+32] __LF \ + lsl x0, x1, #2 __LF \ + ldp x7, x8, [P2] __LF \ + subs x0, x0, x7 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x8 __LF \ + ldp x7, x8, [P2+16] __LF \ + extr x2, x3, x2, #62 __LF \ + sbcs x2, x2, x7 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x8 __LF \ + extr x4, x5, x4, #62 __LF \ + ldp x7, x8, [P2+32] __LF \ + sbcs x4, x4, x7 __LF \ + extr x5, x6, x5, #62 __LF \ + sbcs x5, x5, x8 __LF \ + lsr x6, x6, #62 __LF \ + adc x6, x6, xzr __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x8, cc __LF \ + mov x9, #0xffffffff __LF \ + and x9, x9, x8 __LF \ + adds x0, x0, x9 __LF \ + eor x9, x9, x8 __LF \ + adcs x1, x1, x9 __LF \ + mov x9, #0xfffffffffffffffe __LF \ + and x9, x9, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x8 __LF \ + adcs x4, x4, x8 __LF \ + adc x5, x5, x8 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ stp x4, x5, [P0+32] // P0 = C * P1 - D * P2 #define cmsub_p384(P0,C,P1,D,P2) \ - ldp x0, x1, [P2]; \ - mov x6, #0x00000000ffffffff; \ - subs x6, x6, x0; \ - mov x7, #0xffffffff00000000; \ - sbcs x7, x7, x1; \ - ldp x0, x1, [P2+16]; \ - mov x8, #0xfffffffffffffffe; \ - sbcs x8, x8, x0; \ - mov x13, #0xffffffffffffffff; \ - sbcs x9, x13, x1; \ - ldp x0, x1, [P2+32]; \ - sbcs x10, x13, x0; \ - sbc x11, x13, x1; \ - mov x12, D; \ - mul x0, x12, x6; \ - mul x1, x12, x7; \ - mul x2, x12, x8; \ - mul x3, x12, x9; \ - mul x4, x12, x10; \ - mul x5, x12, x11; \ - umulh x6, x12, x6; \ - umulh x7, x12, x7; \ - umulh x8, x12, x8; \ - umulh x9, x12, x9; \ - umulh x10, x12, x10; \ - umulh x12, x12, x11; \ - adds x1, x1, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x8; \ - adcs x4, x4, x9; \ - adcs x5, x5, x10; \ - mov x6, #1; \ - adc x6, x12, x6; \ - ldp x8, x9, [P1]; \ - ldp x10, x11, [P1+16]; \ - ldp x12, x13, [P1+32]; \ - mov x14, C; \ - mul x15, x14, x8; \ - umulh x8, x14, x8; \ - adds x0, x0, x15; \ - mul x15, x14, x9; \ - umulh x9, x14, x9; \ - adcs x1, x1, x15; \ - mul x15, x14, x10; \ - umulh x10, x14, x10; \ - adcs x2, x2, x15; \ - mul x15, x14, x11; \ - umulh x11, x14, x11; \ - adcs x3, x3, x15; \ - mul x15, x14, x12; \ - umulh x12, x14, x12; \ - adcs x4, x4, x15; \ - mul x15, x14, x13; \ - umulh x13, x14, x13; \ - adcs x5, x5, x15; \ - adc x6, x6, xzr; \ - adds x1, x1, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x10; \ - adcs x4, x4, x11; \ - adcs x5, x5, x12; \ - adcs x6, x6, 
x13; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x6, cc; \ - mov x7, #0xffffffff; \ - and x7, x7, x6; \ - adds x0, x0, x7; \ - eor x7, x7, x6; \ - adcs x1, x1, x7; \ - mov x7, #0xfffffffffffffffe; \ - and x7, x7, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x6; \ - adcs x4, x4, x6; \ - adc x5, x5, x6; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ + ldp x0, x1, [P2] __LF \ + mov x6, #0x00000000ffffffff __LF \ + subs x6, x6, x0 __LF \ + mov x7, #0xffffffff00000000 __LF \ + sbcs x7, x7, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + mov x8, #0xfffffffffffffffe __LF \ + sbcs x8, x8, x0 __LF \ + mov x13, #0xffffffffffffffff __LF \ + sbcs x9, x13, x1 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x10, x13, x0 __LF \ + sbc x11, x13, x1 __LF \ + mov x12, D __LF \ + mul x0, x12, x6 __LF \ + mul x1, x12, x7 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x4, x12, x10 __LF \ + mul x5, x12, x11 __LF \ + umulh x6, x12, x6 __LF \ + umulh x7, x12, x7 __LF \ + umulh x8, x12, x8 __LF \ + umulh x9, x12, x9 __LF \ + umulh x10, x12, x10 __LF \ + umulh x12, x12, x11 __LF \ + adds x1, x1, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x8 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + mov x6, #1 __LF \ + adc x6, x12, x6 __LF \ + ldp x8, x9, [P1] __LF \ + ldp x10, x11, [P1+16] __LF \ + ldp x12, x13, [P1+32] __LF \ + mov x14, C __LF \ + mul x15, x14, x8 __LF \ + umulh x8, x14, x8 __LF \ + adds x0, x0, x15 __LF \ + mul x15, x14, x9 __LF \ + umulh x9, x14, x9 __LF \ + adcs x1, x1, x15 __LF \ + mul x15, x14, x10 __LF \ + umulh x10, x14, x10 __LF \ + adcs x2, x2, x15 __LF \ + mul x15, x14, x11 __LF \ + umulh x11, x14, x11 __LF \ + adcs x3, x3, x15 __LF \ + mul x15, x14, x12 __LF \ + umulh x12, x14, x12 __LF \ + adcs x4, x4, x15 __LF \ + mul x15, x14, x13 __LF \ + umulh x13, x14, x13 __LF \ + adcs x5, x5, x15 __LF \ + adc x6, x6, xzr __LF \ + adds x1, x1, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x10 __LF \ + adcs x4, x4, x11 __LF \ + adcs x5, x5, x12 __LF \ + adcs x6, x6, x13 __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x6, cc __LF \ + mov x7, #0xffffffff __LF \ + and x7, x7, x6 __LF \ + adds x0, x0, x7 __LF \ + eor x7, x7, x6 __LF \ + adcs x1, x1, x7 __LF \ + mov x7, #0xfffffffffffffffe __LF \ + and x7, x7, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x6 __LF \ + adcs x4, x4, x6 __LF \ + adc x5, x5, x6 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ stp x4, x5, [P0+32] // A weak version of add that only guarantees sum in 6 digits #define weakadd_p384(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - adds x5, x5, x4; \ - adcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - adcs x9, x9, x4; \ - adcs x10, x10, x3; \ - csetm x3, cs; \ - mov x4, #0xffffffff; \ - and x4, x4, x3; \ - subs x5, x5, x4; \ - eor x4, x4, x3; \ - sbcs x6, x6, x4; \ - mov x4, #0xfffffffffffffffe; \ - and x4, x4, x3; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - sbcs x9, x9, x3; \ - sbc x10, x10, x3; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adds x5, x5, x4 __LF \ + adcs x6, 
x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x3 __LF \ + csetm x3, cs __LF \ + mov x4, #0xffffffff __LF \ + and x4, x4, x3 __LF \ + subs x5, x5, x4 __LF \ + eor x4, x4, x3 __LF \ + sbcs x6, x6, x4 __LF \ + mov x4, #0xfffffffffffffffe __LF \ + and x4, x4, x3 __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x3 __LF \ + sbc x10, x10, x3 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ stp x9, x10, [P0+32] // P0 = 3 * P1 - 8 * P2 #define cmsub38_p384(P0,P1,P2) \ - ldp x0, x1, [P2]; \ - mov x6, #0x00000000ffffffff; \ - subs x6, x6, x0; \ - mov x7, #0xffffffff00000000; \ - sbcs x7, x7, x1; \ - ldp x0, x1, [P2+16]; \ - mov x8, #0xfffffffffffffffe; \ - sbcs x8, x8, x0; \ - mov x13, #0xffffffffffffffff; \ - sbcs x9, x13, x1; \ - ldp x0, x1, [P2+32]; \ - sbcs x10, x13, x0; \ - sbc x11, x13, x1; \ - lsl x0, x6, #3; \ - extr x1, x7, x6, #61; \ - extr x2, x8, x7, #61; \ - extr x3, x9, x8, #61; \ - extr x4, x10, x9, #61; \ - extr x5, x11, x10, #61; \ - lsr x6, x11, #61; \ - add x6, x6, #1; \ - ldp x8, x9, [P1]; \ - ldp x10, x11, [P1+16]; \ - ldp x12, x13, [P1+32]; \ - mov x14, 3; \ - mul x15, x14, x8; \ - umulh x8, x14, x8; \ - adds x0, x0, x15; \ - mul x15, x14, x9; \ - umulh x9, x14, x9; \ - adcs x1, x1, x15; \ - mul x15, x14, x10; \ - umulh x10, x14, x10; \ - adcs x2, x2, x15; \ - mul x15, x14, x11; \ - umulh x11, x14, x11; \ - adcs x3, x3, x15; \ - mul x15, x14, x12; \ - umulh x12, x14, x12; \ - adcs x4, x4, x15; \ - mul x15, x14, x13; \ - umulh x13, x14, x13; \ - adcs x5, x5, x15; \ - adc x6, x6, xzr; \ - adds x1, x1, x8; \ - adcs x2, x2, x9; \ - adcs x3, x3, x10; \ - adcs x4, x4, x11; \ - adcs x5, x5, x12; \ - adcs x6, x6, x13; \ - lsl x7, x6, #32; \ - subs x8, x6, x7; \ - sbc x7, x7, xzr; \ - adds x0, x0, x8; \ - adcs x1, x1, x7; \ - adcs x2, x2, x6; \ - adcs x3, x3, xzr; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - csetm x6, cc; \ - mov x7, #0xffffffff; \ - and x7, x7, x6; \ - adds x0, x0, x7; \ - eor x7, x7, x6; \ - adcs x1, x1, x7; \ - mov x7, #0xfffffffffffffffe; \ - and x7, x7, x6; \ - adcs x2, x2, x7; \ - adcs x3, x3, x6; \ - adcs x4, x4, x6; \ - adc x5, x5, x6; \ - stp x0, x1, [P0]; \ - stp x2, x3, [P0+16]; \ + ldp x0, x1, [P2] __LF \ + mov x6, #0x00000000ffffffff __LF \ + subs x6, x6, x0 __LF \ + mov x7, #0xffffffff00000000 __LF \ + sbcs x7, x7, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + mov x8, #0xfffffffffffffffe __LF \ + sbcs x8, x8, x0 __LF \ + mov x13, #0xffffffffffffffff __LF \ + sbcs x9, x13, x1 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x10, x13, x0 __LF \ + sbc x11, x13, x1 __LF \ + lsl x0, x6, #3 __LF \ + extr x1, x7, x6, #61 __LF \ + extr x2, x8, x7, #61 __LF \ + extr x3, x9, x8, #61 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + lsr x6, x11, #61 __LF \ + add x6, x6, #1 __LF \ + ldp x8, x9, [P1] __LF \ + ldp x10, x11, [P1+16] __LF \ + ldp x12, x13, [P1+32] __LF \ + mov x14, 3 __LF \ + mul x15, x14, x8 __LF \ + umulh x8, x14, x8 __LF \ + adds x0, x0, x15 __LF \ + mul x15, x14, x9 __LF \ + umulh x9, x14, x9 __LF \ + adcs x1, x1, x15 __LF \ + mul x15, x14, x10 __LF \ + umulh x10, x14, x10 __LF \ + adcs x2, x2, x15 __LF \ + mul x15, x14, x11 __LF \ + umulh x11, x14, x11 __LF \ + adcs x3, x3, x15 __LF \ + mul x15, x14, x12 __LF \ + umulh x12, x14, x12 __LF \ + adcs x4, x4, x15 __LF \ + mul x15, x14, x13 __LF \ + umulh x13, x14, x13 __LF \ + adcs 
x5, x5, x15 __LF \ + adc x6, x6, xzr __LF \ + adds x1, x1, x8 __LF \ + adcs x2, x2, x9 __LF \ + adcs x3, x3, x10 __LF \ + adcs x4, x4, x11 __LF \ + adcs x5, x5, x12 __LF \ + adcs x6, x6, x13 __LF \ + lsl x7, x6, #32 __LF \ + subs x8, x6, x7 __LF \ + sbc x7, x7, xzr __LF \ + adds x0, x0, x8 __LF \ + adcs x1, x1, x7 __LF \ + adcs x2, x2, x6 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csetm x6, cc __LF \ + mov x7, #0xffffffff __LF \ + and x7, x7, x6 __LF \ + adds x0, x0, x7 __LF \ + eor x7, x7, x6 __LF \ + adcs x1, x1, x7 __LF \ + mov x7, #0xfffffffffffffffe __LF \ + and x7, x7, x6 __LF \ + adcs x2, x2, x7 __LF \ + adcs x3, x3, x6 __LF \ + adcs x4, x4, x6 __LF \ + adc x5, x5, x6 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] __LF \ stp x4, x5, [P0+32] S2N_BN_SYMBOL(p384_montjdouble): diff --git a/third_party/s2n-bignum/arm/p521/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/Makefile similarity index 93% rename from third_party/s2n-bignum/arm/p521/Makefile rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/Makefile index 3936b48307c..620ff871d4f 100644 --- a/third_party/s2n-bignum/arm/p521/Makefile +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/Makefile @@ -33,18 +33,14 @@ OBJ = bignum_add_p521.o \ bignum_mod_p521_9.o \ bignum_montmul_p521.o \ bignum_montmul_p521_alt.o \ - bignum_montmul_p521_neon.o \ bignum_montsqr_p521.o \ bignum_montsqr_p521_alt.o \ - bignum_montsqr_p521_neon.o \ bignum_mul_p521.o \ bignum_mul_p521_alt.o \ - bignum_mul_p521_neon.o \ bignum_neg_p521.o \ bignum_optneg_p521.o \ bignum_sqr_p521.o \ bignum_sqr_p521_alt.o \ - bignum_sqr_p521_neon.o \ bignum_sub_p521.o \ bignum_tolebytes_p521.o \ bignum_tomont_p521.o \ diff --git a/third_party/s2n-bignum/arm/p521/bignum_add_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_add_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_add_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_add_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_cmul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_cmul_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_cmul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_cmul_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_deamont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_deamont_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_deamont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_deamont_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_demont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_demont_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_demont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_demont_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_double_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_double_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_double_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_double_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_fromlebytes_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_fromlebytes_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_fromlebytes_p521.S rename to 
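Stepping back to the p384_montjdouble.S hunks above: cmsub_p384 and cmsub38_p384 evaluate C*P1 - D*P2 as the non-negative C*P1 + D*(p_384 - P2), where the six constants loaded at the top of each macro (0x00000000ffffffff, 0xffffffff00000000, 0xfffffffffffffffe and three all-ones words) are exactly the limbs of p_384; with small C and D the result then needs only the short final reduction that follows. A sketch of that identity, assuming P2 < p_384 so the word-wise complement never borrows:

import random

p_384 = 2**384 - 2**128 - 2**96 + 2**32 - 1
limbs = [0x00000000ffffffff, 0xffffffff00000000, 0xfffffffffffffffe,
         0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff]
assert sum(l << (64 * i) for i, l in enumerate(limbs)) == p_384
for _ in range(100):                         # the 3*P1 - 8*P2 case of cmsub38_p384
    P1, P2 = random.randrange(p_384), random.randrange(p_384)
    assert (3 * P1 + 8 * (p_384 - P2)) % p_384 == (3 * P1 - 8 * P2) % p_384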
third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_fromlebytes_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_half_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_half_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_half_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_half_p521.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_inv_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_inv_p521.S new file mode 100644 index 00000000000..2962fc2106f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_inv_p521.S @@ -0,0 +1,1696 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_521 = 2^521 - 1 +// Input x[9]; output z[9] +// +// extern void bignum_inv_p521(uint64_t z[static 9],uint64_t x[static 9]); +// +// Assuming the 9-digit input x is coprime to p_521, i.e. is not divisible +// by it, returns z < p_521 such that x * z == 1 (mod p_521). Note that +// x does not need to be reduced modulo p_521, but the output always is. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p521) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(9*N) +#define u sp, #(18*N) +#define v sp, #(27*N) + +// Total size to reserve on the stack + +#define NSPACE #(36*N) + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, 
x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr 
x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, 
x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, 
#0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_inv_p521): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime p_521 = 2^521 - 1 into the f variable + + mov x10, #0xFFFFFFFFFFFFFFFF + stp x10, x10, [f] + stp x10, x10, [f+16] + stp x10, x10, [f+32] + stp x10, x10, [f+48] + mov x11, #0x1FF + str x11, [f+64] + +// Copy the input into the g variable, but reduce it strictly mod p_521 +// so that g <= f as assumed in the bound proof. This code fragment is +// very similar to bignum_mod_p521_9 complete with carry condensation. 
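As a side note, the contract of this strict-reduction fragment can be sketched in plain C. This is a minimal sketch assuming only the nine-limb little-endian layout (bits 512..575 in the top word); the helper name ref_mod_p521_9 is hypothetical, and it uses a plain trial addition instead of the carry-condensation trick in the assembly below, relying on 2^521 == 1 (mod p_521).

#include <stdint.h>

// Reference sketch (not the s2n-bignum routine): reduce a 9-limb value
// x < 2^576 strictly mod p_521 = 2^521 - 1.
void ref_mod_p521_9(uint64_t z[9], const uint64_t x[9]) {
    uint64_t h = x[8] >> 9;               // bits above position 520, so x = lo + h*2^521
    unsigned __int128 c = (unsigned __int128)h + 1;   // trial add of h + 1 (GCC/Clang extension)
    uint64_t t[9];
    for (int i = 0; i < 9; i++) {
        c += (i == 8) ? (x[8] & 0x1FF) : x[i];
        t[i] = (uint64_t)c;
        c >>= 64;
    }
    // lo + h + 1 reaches 2^521 exactly when lo + h >= p_521
    if (t[8] >> 9) {
        for (int i = 0; i < 9; i++) z[i] = t[i];
        z[8] &= 0x1FF;                    // lo + h - p_521 = (lo + h + 1) mod 2^521
    } else {
        c = h;                            // lo + h < p_521: redo the addition without the +1
        for (int i = 0; i < 9; i++) {
            c += (i == 8) ? (x[8] & 0x1FF) : x[i];
            z[i] = (uint64_t)c;
            c >>= 64;
        }
    }
}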
+ + ldr x8, [x1, #64] + lsr x9, x8, #9 + + subs xzr, xzr, xzr + ldp x10, x11, [x1] + adcs xzr, x10, x9 + adcs xzr, x11, xzr + ldp x12, x13, [x1, #16] + and x7, x12, x13 + adcs xzr, x7, xzr + ldp x14, x15, [x1, #32] + and x7, x14, x15 + adcs xzr, x7, xzr + ldp x16, x17, [x1, #48] + and x7, x16, x17 + adcs xzr, x7, xzr + orr x7, x8, #~0x1FF + adcs x7, x7, xzr + + adcs x10, x10, x9 + adcs x11, x11, xzr + adcs x12, x12, xzr + adcs x13, x13, xzr + adcs x14, x14, xzr + adcs x15, x15, xzr + adcs x16, x16, xzr + adcs x17, x17, xzr + adc x8, x8, xzr + and x8, x8, #0x1FF + + stp x10, x11, [g] + stp x12, x13, [g+16] + stp x14, x15, [g+32] + stp x16, x17, [g+48] + str x8, [g+64] + +// Also maintain weakly reduced < 2*p_521 vector [u,v] such that +// [f,g] == x * 2^{1239-59*i} * [u,v] (mod p_521) +// starting with [p_521,x] == x * 2^{1239-59*0} * [0,2^-1239] (mod p_521) +// Note that because (2^{a+521} == 2^a) (mod p_521) we simply have +// (2^-1239 == 2^324) (mod p_521) so the constant initializer is simple. +// +// Based on the standard divstep bound, for inputs <= 2^b we need at least +// n >= (9437 * b + 1) / 4096. Since b is 521, that means 1201 iterations. +// Since we package divstep in multiples of 59 bits, we do 21 blocks of 59 +// making *1239* total. (With a bit more effort we could avoid the full 59 +// divsteps and use a shorter tail computation, but we keep it simple.) +// Hence, after the 21st iteration we have [f,g] == x * [u,v] and since +// |f| = 1 we get the modular inverse from u by flipping its sign with f. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+16] + stp xzr, xzr, [u+32] + stp xzr, xzr, [u+48] + str xzr, [u+64] + + mov x10, #16 + stp xzr, xzr, [v] + stp xzr, xzr, [v+16] + stp xzr, x10, [v+32] + stp xzr, xzr, [v+48] + str xzr, [v+64] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special 21st iteration after a uniform +// first 20. + + mov i, #21 + mov d, #1 + b bignum_inv_p521_midloop + +bignum_inv_p521_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
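A quick check of the iteration count quoted in the comment above, under the stated divstep bound n >= (9437*b + 1)/4096 (taken from the source comments, not re-derived here):

#include <assert.h>

int main(void) {
    const long b = 521;
    const long need = (9437 * b + 1 + 4095) / 4096;  // ceiling of (9437*521 + 1)/4096
    assert(need == 1201);                            // minimum number of divsteps
    assert(21 * 59 == 1239 && 1239 >= need);         // 21 blocks of 59 cover it
    return 0;
}

So the 2^{1239-59*i} factor in the [u,v] invariant is just 21 blocks of 59 divsteps, the total actually performed.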
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digit 3 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + +// Digit 4 of [f,g] + + ldr x7, [f+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [g+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [g+3*N] + +// Digit 5 of [f,g] + + ldr x7, [f+5*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, xzr, x1 + ldr x8, [g+5*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [g+4*N] + +// Digit 6 of [f,g] + + ldr x7, [f+6*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+6*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f+5*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g+5*N] + +// Digit 7 of [f,g] + + ldr x7, [f+7*N] + eor x1, x7, s00 + mul x0, x1, 
m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+7*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+6*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+6*N] + +// Digits 8 and 9 of [f,g] + + ldr x7, [f+8*N] + eor x1, x7, s00 + asr x3, x1, #63 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+8*N] + eor x1, x8, s01 + asr x0, x1, #63 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+7*N] + extr x5, x3, x5, #59 + str x5, [f+8*N] + + eor x1, x7, s10 + asr x5, x1, #63 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + asr x0, x1, #63 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+7*N] + extr x2, x5, x2, #59 + str x2, [g+8*N] + +// Now the computation of the updated u and v values and their +// modular reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digit 3 of [u,v] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + str x2, [v+3*N] + adc x6, x6, x1 + +// Digit 4 of [u,v] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + + eor x1, x7, s10 
+ mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + str x6, [v+4*N] + adc x5, x5, x1 + +// Digit 5 of [u,v] + + ldr x7, [u+5*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, xzr, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v+5*N] + adc x3, x3, x1 + +// Digit 6 of [u,v] + + ldr x7, [u+6*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+6*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+6*N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+6*N] + adc x4, x4, x1 + +// Digit 7 of [u,v] + + ldr x7, [u+7*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+7*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+7*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+7*N] + adc x2, x2, x1 + +// Digits 8 and 9 of u (top is unsigned) + + ldr x7, [u+8*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+8*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Modular reduction of u, reloading as needed from u[0],...,u[7],x5,x3 + + extr x6, x3, x5, #9 + ldp x0, x1, [u] + add x6, x6, x3, asr #63 + sub x5, x5, x6, lsl #9 + adds x0, x0, x6 + asr x6, x6, #63 + adcs x1, x1, x6 + stp x0, x1, [u] + ldp x0, x1, [u+16] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [u+16] + ldp x0, x1, [u+32] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [u+32] + ldp x0, x1, [u+48] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [u+48] + adc x5, x5, x6 + str x5, [u+64] + +// Digits 8 and 9 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Modular reduction of v, reloading as needed from v[0],...,v[7],x2,x5 + + extr x6, x5, x2, #9 + ldp x0, x1, [v] + add x6, x6, x5, asr #63 + sub x2, x2, x6, lsl #9 + adds x0, x0, x6 + asr x6, x6, #63 + adcs x1, x1, x6 + stp x0, x1, [v] + ldp x0, x1, [v+16] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [v+16] + ldp x0, x1, [v+32] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [v+32] + ldp x0, x1, [v+48] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [v+48] + adc x2, x2, x6 + str x2, [v+64] + +bignum_inv_p521_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_inv_p521_loop + +// The 21st and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. 
So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_521) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digit 3 of [u] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + +// Digit 4 of [u] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + +// Digit 5 of [u] + + ldr x7, [u+5*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, xzr, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + +// Digit 6 of [u] + + ldr x7, [u+6*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+6*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+6*N] + adc x6, x6, x1 + +// Digit 7 of [u] + + ldr x7, [u+7*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+7*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+7*N] + adc x5, x5, x1 + +// Digits 8 and 9 of u (top is unsigned) + + ldr x7, [u+8*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+8*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc 
x3, x3, x1 + +// Modular reduction of u, reloading as needed from u[0],...,u[7],x5,x3 + + extr x6, x3, x5, #9 + ldp x10, x11, [u] + add x6, x6, x3, asr #63 + sub x5, x5, x6, lsl #9 + adds x10, x10, x6 + asr x6, x6, #63 + adcs x11, x11, x6 + ldp x12, x13, [u+16] + adcs x12, x12, x6 + adcs x13, x13, x6 + ldp x14, x15, [u+32] + adcs x14, x14, x6 + adcs x15, x15, x6 + ldp x16, x17, [u+48] + adcs x16, x16, x6 + adcs x17, x17, x6 + adc x19, x5, x6 + +// Further strict reduction ready for the output, which just means +// a conditional subtraction of p_521 + + subs x0, x10, #-1 + adcs x1, x11, xzr + adcs x2, x12, xzr + adcs x3, x13, xzr + adcs x4, x14, xzr + adcs x5, x15, xzr + adcs x6, x16, xzr + adcs x7, x17, xzr + mov x8, #0x1FF + sbcs x8, x19, x8 + + csel x0, x0, x10, cs + csel x1, x1, x11, cs + csel x2, x2, x12, cs + csel x3, x3, x13, cs + csel x4, x4, x14, cs + csel x5, x5, x15, cs + csel x6, x6, x16, cs + csel x7, x7, x17, cs + csel x8, x8, x19, cs + +// Store it back to the final output + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x4, x5, [res, #32] + stp x6, x7, [res, #48] + str x8, [res, #64] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/bignum_mod_n521_9.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mod_n521_9.S similarity index 94% rename from third_party/s2n-bignum/arm/p521/bignum_mod_n521_9.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mod_n521_9.S index d680e5f1db1..6dec50317b5 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_mod_n521_9.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mod_n521_9.S @@ -47,9 +47,9 @@ #define t d7 #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(bignum_mod_n521_9): diff --git a/third_party/s2n-bignum/arm/p521/bignum_mod_p521_9.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mod_p521_9.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_mod_p521_9.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mod_p521_9.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montmul_p521.S similarity index 99% rename from third_party/s2n-bignum/arm/p521/bignum_montmul_p521_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montmul_p521.S index 9586339f955..a88442df4a6 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montmul_p521.S @@ -5,7 +5,7 @@ // Montgomery multiply, z := (x * y / 2^576) mod p_521 // Inputs x[9], y[9]; output z[9] // -// extern void bignum_montmul_p521_neon +// extern void bignum_montmul_p521 // (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); // // Does z := (x * y / 2^576) mod p_521, assuming x < p_521, y < p_521. This @@ -17,7 +17,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" -// bignum_montmul_p521_neon is functionally equivalent to bignum_montmul_p521. +// bignum_montmul_p521 is functionally equivalent to +// unopt/bignum_montmul_p521_base. 
// It is written in a way that // 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully // chosen and vectorized @@ -717,12 +718,12 @@ // # from this file since the sequence is non-deterministically chosen. // # Please add 'ret' at the end of the output assembly. - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521) .text .balign 4 -S2N_BN_SYMBOL(bignum_montmul_p521_neon): +S2N_BN_SYMBOL(bignum_montmul_p521): // Save registers and make space for the temporary buffer diff --git a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montmul_p521_alt.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_montmul_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montmul_p521_alt.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montsqr_p521.S similarity index 99% rename from third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montsqr_p521.S index 57cf9116156..5d1dccfd539 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montsqr_p521.S @@ -5,7 +5,7 @@ // Montgomery square, z := (x^2 / 2^576) mod p_521 // Input x[9]; output z[9] // -// extern void bignum_montsqr_p521_neon +// extern void bignum_montsqr_p521 // (uint64_t z[static 9], uint64_t x[static 9]); // // Does z := (x^2 / 2^576) mod p_521, assuming x < p_521. This means the @@ -17,7 +17,8 @@ // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" -// bignum_montsqr_p521_neon is functionally equivalent to bignum_montsqr_p521. +// bignum_montsqr_p521 is functionally equivalent to +// unopt/bignum_montsqr_p521. // It is written in a way that // 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully // chosen and vectorized @@ -570,12 +571,12 @@ // # from this file since the sequence is non-deterministically chosen. // # Please add 'ret' at the end of the output assembly. 
- S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521) .text .balign 4 -S2N_BN_SYMBOL(bignum_montsqr_p521_neon): +S2N_BN_SYMBOL(bignum_montsqr_p521): // Save registers diff --git a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montsqr_p521_alt.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_montsqr_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_montsqr_p521_alt.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_mul_p521_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mul_p521.S similarity index 99% rename from third_party/s2n-bignum/arm/p521/bignum_mul_p521_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mul_p521.S index c9d34151d56..5eba505ba7e 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_mul_p521_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mul_p521.S @@ -5,14 +5,14 @@ // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced // Inputs x[9], y[9]; output z[9] // -// extern void bignum_mul_p521_neon +// extern void bignum_mul_p521 // (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); // // Standard ARM ABI: X0 = z, X1 = x, X2 = y // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" -// bignum_mul_p521_neon is functionally equivalent to bignum_mul_p521. +// bignum_mul_p521 is functionally equivalent to unopt/bignum_mul_p521_base. // It is written in a way that // 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully // chosen and vectorized @@ -708,12 +708,12 @@ // # from this file since the sequence is non-deterministically chosen. // # Please add 'ret' at the end of the output assembly. 
- S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521) .text .balign 4 -S2N_BN_SYMBOL(bignum_mul_p521_neon): +S2N_BN_SYMBOL(bignum_mul_p521): // Save registers and make space for the temporary buffer diff --git a/third_party/s2n-bignum/arm/p521/bignum_mul_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mul_p521_alt.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_mul_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_mul_p521_alt.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_neg_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_neg_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_neg_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_neg_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_optneg_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_optneg_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_optneg_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_optneg_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521_neon.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sqr_p521.S similarity index 99% rename from third_party/s2n-bignum/arm/p521/bignum_sqr_p521_neon.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sqr_p521.S index 13cd1c25419..6c9cac4d7c0 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521_neon.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sqr_p521.S @@ -5,14 +5,14 @@ // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced // Input x[9]; output z[9] // -// extern void bignum_sqr_p521_neon (uint64_t z[static 9], +// extern void bignum_sqr_p521 (uint64_t z[static 9], // uint64_t x[static 9]); // // Standard ARM ABI: X0 = z, X1 = x // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" -// bignum_montsqr_p521_neon is functionally equivalent to bignum_montsqr_p521. +// bignum_sqr_p521 is functionally equivalent to unopt/bignum_sqr_p521_base. // It is written in a way that // 1. A subset of scalar multiplications in bignum_montmul_p384 are carefully // chosen and vectorized @@ -567,12 +567,12 @@ // # from this file since the sequence is non-deterministically chosen. // # Please add 'ret' at the end of the output assembly. 
- S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521_neon) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521_neon) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521) .text .balign 4 -S2N_BN_SYMBOL(bignum_sqr_p521_neon): +S2N_BN_SYMBOL(bignum_sqr_p521): // Save registers diff --git a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sqr_p521_alt.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_sqr_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sqr_p521_alt.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_sub_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sub_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_sub_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_sub_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_tolebytes_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_tolebytes_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_tolebytes_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_tolebytes_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_tomont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_tomont_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_tomont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_tomont_p521.S diff --git a/third_party/s2n-bignum/arm/p521/bignum_triple_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_triple_p521.S similarity index 100% rename from third_party/s2n-bignum/arm/p521/bignum_triple_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/bignum_triple_p521.S diff --git a/third_party/s2n-bignum/arm/p521/p521_jadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/p521_jadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd.S index 6dbcad2b7bd..de36c6566a5 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd.S @@ -84,21 +84,21 @@ // and bignum_sub_p521 #define mul_p521(P0,P1,P2) \ - add x0, P0; \ - add x1, P1; \ - add x2, P2; \ - bl local_mul_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + add x2, P2 __LF \ + bl p521_jadd_local_mul_p521 #define sqr_p521(P0,P1) \ - add x0, P0; \ - add x1, P1; \ - bl local_sqr_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + bl p521_jadd_local_sqr_p521 #define sub_p521(P0,P1,P2) \ - add x0, P0; \ - add x1, P1; \ - add x2, P2; \ - bl local_sub_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + add x2, P2 __LF \ + bl p521_jadd_local_sub_p521 S2N_BN_SYMBOL(p521_jadd): @@ -348,9 +348,9 @@ S2N_BN_SYMBOL(p521_jadd): ret // Local versions of the three field operations, identical to -// bignum_mul_p521_neon, bignum_sqr_p521_neon and bignum_sub_p521. +// bignum_mul_p521, bignum_sqr_p521 and bignum_sub_p521. -local_mul_p521: +p521_jadd_local_mul_p521: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! @@ -1027,7 +1027,7 @@ local_mul_p521: ldp x19, x20, [sp], #16 ret -local_sqr_p521: +p521_jadd_local_sqr_p521: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! 
@@ -1563,7 +1563,7 @@ local_sqr_p521: ldp x19, x20, [sp], #16 ret -local_sub_p521: +p521_jadd_local_sub_p521: ldp x5, x6, [x1] ldp x4, x3, [x2] subs x5, x5, x4 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd_alt.S new file mode 100644 index 00000000000..da6166b8138 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jadd_alt.S @@ -0,0 +1,979 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 27]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_521, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 +#define input_y x28 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +// NUMSIZE*7 is not 16-aligned so we round it up + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x15, x3, x5 __LF \ + umulh x16, x3, x5 __LF \ + mul x14, x3, x6 __LF \ + umulh x17, x3, x6 __LF \ + adds x16, x16, x14 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x14, x3, x7 __LF \ + umulh x19, x3, x7 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x8 __LF \ + umulh x20, x3, x8 __LF \ + adcs x19, x19, x14 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x14, x3, x9 __LF \ + umulh x21, x3, x9 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x10 __LF \ + umulh x22, x3, x10 __LF \ + adcs x21, x21, x14 __LF \ + ldp x11, x12, [P2+48] __LF \ + mul x14, x3, x11 __LF \ + umulh x23, x3, x11 __LF \ + adcs x22, x22, x14 __LF \ + ldr x13, 
[P2+64] __LF \ + mul x14, x3, x12 __LF \ + umulh x24, x3, x12 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x13 __LF \ + umulh x1, x3, x13 __LF \ + adcs x24, x24, x14 __LF \ + adc x1, x1, xzr __LF \ + mul x14, x4, x5 __LF \ + adds x16, x16, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x1, x1, x14 __LF \ + cset x0, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x17, x17, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x0, x0, x14 __LF \ + stp x15, x16, [P0] __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x14, x3, x5 __LF \ + adds x17, x17, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x0, x0, x14 __LF \ + cset x15, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x19, x19, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x15, x15, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x19, x19, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x15, x15, x14 __LF \ + cset x16, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x20, x20, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x16, x16, x14 __LF \ + stp x17, x19, [P0+16] __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x14, x3, x5 __LF \ + adds x20, x20, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x24, x24, 
x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x16, x16, x14 __LF \ + cset x17, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x21, x21, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x17, x17, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x21, x21, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x17, x17, x14 __LF \ + cset x19, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x22, x22, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x19, x19, x14 __LF \ + stp x20, x21, [P0+32] __LF \ + ldp x3, x4, [P1+48] __LF \ + mul x14, x3, x5 __LF \ + adds x22, x22, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x19, x19, x14 __LF \ + cset x20, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x23, x23, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x20, x20, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x23, x23, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x20, x20, x14 __LF \ + cset x21, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x24, x24, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x16, x16, x14 __LF \ 
+ umulh x14, x4, x10 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x21, x21, x14 __LF \ + stp x22, x23, [P0+48] __LF \ + ldr x3, [P1+64] __LF \ + mul x14, x3, x5 __LF \ + adds x24, x24, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x13 __LF \ + adc x21, x21, x14 __LF \ + umulh x14, x3, x5 __LF \ + adds x1, x1, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x12 __LF \ + adc x21, x21, x14 __LF \ + cmp xzr, xzr __LF \ + ldp x5, x6, [P0] __LF \ + extr x14, x1, x24, #9 __LF \ + adcs x5, x5, x14 __LF \ + extr x14, x0, x1, #9 __LF \ + adcs x6, x6, x14 __LF \ + ldp x7, x8, [P0+16] __LF \ + extr x14, x15, x0, #9 __LF \ + adcs x7, x7, x14 __LF \ + extr x14, x16, x15, #9 __LF \ + adcs x8, x8, x14 __LF \ + ldp x9, x10, [P0+32] __LF \ + extr x14, x17, x16, #9 __LF \ + adcs x9, x9, x14 __LF \ + extr x14, x19, x17, #9 __LF \ + adcs x10, x10, x14 __LF \ + ldp x11, x12, [P0+48] __LF \ + extr x14, x20, x19, #9 __LF \ + adcs x11, x11, x14 __LF \ + extr x14, x21, x20, #9 __LF \ + adcs x12, x12, x14 __LF \ + orr x13, x24, #0xfffffffffffffe00 __LF \ + lsr x14, x21, #9 __LF \ + adcs x13, x13, x14 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x11, x2, x3 __LF \ + umulh x12, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x10, x2, x4 __LF \ + umulh x13, x2, x4 __LF \ + adds x12, x12, x10 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x10, x2, x5 __LF \ + umulh x14, x2, x5 __LF \ + adcs x13, x13, x10 __LF \ + ldp x8, x9, [P1+48] __LF \ + mul x10, x2, x6 __LF \ + umulh x15, x2, x6 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x2, x7 __LF \ + umulh x16, x2, x7 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x2, x8 __LF \ + umulh x17, x2, x8 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x2, x9 __LF \ + umulh x19, x2, x9 __LF \ + adcs x17, x17, x10 __LF \ + adc x19, x19, xzr __LF \ + mul x10, x3, x4 __LF \ + adds x13, x13, x10 __LF \ + mul x10, x3, x5 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x3, x6 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x3, x7 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x3, x8 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x3, x9 __LF \ + adcs x19, x19, x10 __LF \ + cset x20, hs __LF \ + umulh x10, x3, x4 __LF \ + adds x14, x14, x10 __LF \ + umulh x10, x3, x5 __LF \ + adcs x15, x15, x10 __LF \ + umulh x10, x3, x6 __LF \ + adcs x16, x16, x10 __LF \ 
+ umulh x10, x3, x7 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x3, x8 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x3, x9 __LF \ + adc x20, x20, x10 __LF \ + mul x10, x6, x7 __LF \ + umulh x21, x6, x7 __LF \ + adds x20, x20, x10 __LF \ + adc x21, x21, xzr __LF \ + mul x10, x4, x5 __LF \ + adds x15, x15, x10 __LF \ + mul x10, x4, x6 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x4, x7 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x4, x8 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x4, x9 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x6, x8 __LF \ + adcs x21, x21, x10 __LF \ + cset x22, hs __LF \ + umulh x10, x4, x5 __LF \ + adds x16, x16, x10 __LF \ + umulh x10, x4, x6 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x4, x7 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x4, x8 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x4, x9 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x6, x8 __LF \ + adc x22, x22, x10 __LF \ + mul x10, x7, x8 __LF \ + umulh x23, x7, x8 __LF \ + adds x22, x22, x10 __LF \ + adc x23, x23, xzr __LF \ + mul x10, x5, x6 __LF \ + adds x17, x17, x10 __LF \ + mul x10, x5, x7 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x5, x8 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x5, x9 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x6, x9 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x7, x9 __LF \ + adcs x23, x23, x10 __LF \ + cset x24, hs __LF \ + umulh x10, x5, x6 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x5, x7 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x5, x8 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x5, x9 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x6, x9 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x7, x9 __LF \ + adc x24, x24, x10 __LF \ + mul x10, x8, x9 __LF \ + umulh x25, x8, x9 __LF \ + adds x24, x24, x10 __LF \ + adc x25, x25, xzr __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + adcs x20, x20, x20 __LF \ + adcs x21, x21, x21 __LF \ + adcs x22, x22, x22 __LF \ + adcs x23, x23, x23 __LF \ + adcs x24, x24, x24 __LF \ + adcs x25, x25, x25 __LF \ + cset x0, hs __LF \ + umulh x10, x2, x2 __LF \ + adds x11, x11, x10 __LF \ + mul x10, x3, x3 __LF \ + adcs x12, x12, x10 __LF \ + umulh x10, x3, x3 __LF \ + adcs x13, x13, x10 __LF \ + mul x10, x4, x4 __LF \ + adcs x14, x14, x10 __LF \ + umulh x10, x4, x4 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x5, x5 __LF \ + adcs x16, x16, x10 __LF \ + umulh x10, x5, x5 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x6, x6 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x6, x6 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x7, x7 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x7, x7 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x8, x8 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x8, x8 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x9, x9 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x9, x9 __LF \ + adc x0, x0, x10 __LF \ + ldr x1, [P1+64] __LF \ + add x1, x1, x1 __LF \ + mul x10, x1, x2 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x1, x2 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x1, x4 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x1, x4 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x1, x6 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x1, x6 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x1, x8 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x1, x8 __LF \ + adcs x0, x0, x10 __LF \ + lsr x4, x1, #1 __LF \ + mul 
x4, x4, x4 __LF \ + adc x4, x4, xzr __LF \ + mul x10, x1, x3 __LF \ + adds x20, x20, x10 __LF \ + umulh x10, x1, x3 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x1, x5 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x1, x5 __LF \ + adcs x23, x23, x10 __LF \ + mul x10, x1, x7 __LF \ + adcs x24, x24, x10 __LF \ + umulh x10, x1, x7 __LF \ + adcs x25, x25, x10 __LF \ + mul x10, x1, x9 __LF \ + adcs x0, x0, x10 __LF \ + umulh x10, x1, x9 __LF \ + adc x4, x4, x10 __LF \ + mul x2, x2, x2 __LF \ + cmp xzr, xzr __LF \ + extr x10, x20, x19, #9 __LF \ + adcs x2, x2, x10 __LF \ + extr x10, x21, x20, #9 __LF \ + adcs x11, x11, x10 __LF \ + extr x10, x22, x21, #9 __LF \ + adcs x12, x12, x10 __LF \ + extr x10, x23, x22, #9 __LF \ + adcs x13, x13, x10 __LF \ + extr x10, x24, x23, #9 __LF \ + adcs x14, x14, x10 __LF \ + extr x10, x25, x24, #9 __LF \ + adcs x15, x15, x10 __LF \ + extr x10, x0, x25, #9 __LF \ + adcs x16, x16, x10 __LF \ + extr x10, x4, x0, #9 __LF \ + adcs x17, x17, x10 __LF \ + orr x19, x19, #0xfffffffffffffe00 __LF \ + lsr x10, x4, #9 __LF \ + adcs x19, x19, x10 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x19, x19, xzr __LF \ + and x19, x19, #0x1ff __LF \ + stp x2, x11, [P0] __LF \ + stp x12, x13, [P0+16] __LF \ + stp x14, x15, [P0+32] __LF \ + stp x16, x17, [P0+48] __LF \ + str x19, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +S2N_BN_SYMBOL(p521_jadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! 
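For orientation, the sub_p521 macro defined just above reduces modulo p_521 = 2^521 - 1 with a single correction pass: it subtracts the operands limbwise, subtracts the final borrow once more, and truncates the top limb to 9 bits, which is the same as adding p_521 back whenever the first subtraction underflows. The following is a minimal C sketch of that idea, not part of the patch; the function name, the nine-limb layout and the __int128 borrow handling are illustrative assumptions.

#include <stdint.h>

/* Illustrative sketch of subtraction mod p_521 = 2^521 - 1 on nine 64-bit
   limbs (least significant first), mirroring the borrow-then-mask pattern
   of bignum_sub_p521 above. Not production code. */
static void sub_p521_sketch(uint64_t z[9], const uint64_t x[9],
                            const uint64_t y[9]) {
  uint64_t borrow = 0;
  uint64_t t[9];
  /* First pass: t = x - y, remembering the final borrow. */
  for (int i = 0; i < 9; i++) {
    unsigned __int128 d = (unsigned __int128)x[i] - y[i] - borrow;
    t[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1; /* 1 iff this limb borrowed */
  }
  /* Second pass: subtract the borrow once more and truncate to 521 bits.
     When x < y this adds p_521 = 2^521 - 1 back in, giving x - y + p_521. */
  for (int i = 0; i < 9; i++) {
    unsigned __int128 d = (unsigned __int128)t[i] - borrow;
    z[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 64) & 1;
  }
  z[8] &= 0x1ff; /* keep only the 9 significant bits of the top limb */
}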
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p521(z1sq,z_1) + sqr_p521(z2sq,z_2) + + mul_p521(y1a,z_2,y_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,z1sq,x_2) + mul_p521(x1a,z2sq,x_1) + mul_p521(y2a,z1sq,y2a) + mul_p521(y1a,z2sq,y1a) + + sub_p521(xd,x2a,x1a) + sub_p521(yd,y2a,y1a) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x1a) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(xd,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y1a) + mul_p521(resz,xd,z_2) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 +// Multiplex the z outputs accordingly and re-store in resz + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + ldp x4, x5, [z_1+32] + ldp x6, x7, [z_1+48] + ldr x8, [z_1+64] + + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x23, x6, x7 + orr x20, x20, x21 + orr x22, x22, x23 + orr x20, x20, x8 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + + ldp x10, x11, [z_2] + ldp x12, x13, [z_2+16] + ldp x14, x15, [z_2+32] + ldp x16, x17, [z_2+48] + ldr x19, [z_2+64] + + orr x21, x10, x11 + orr x22, x12, x13 + orr x23, x14, x15 + orr x24, x16, x17 + orr x21, x21, x22 + orr x23, x23, x24 + orr x21, x21, x19 + orr x21, x21, x23 + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + cmp x21, xzr + cset x21, ne + + cmp x21, x20 + + ldp x10, x11, [resz] + ldp x12, x13, [resz+16] + ldp x14, x15, [resz+32] + ldp x16, x17, [resz+48] + ldr x19, [resz+64] + + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + + stp x0, x1, [resz] + stp x2, x3, [resz+16] + stp x4, x5, [resz+32] + stp x6, x7, [resz+48] + str x8, [resz+64] + +// Multiplex the x and y outputs too, keeping the results in registers + + ldp x20, x21, [x_1] + ldp x0, x1, [resx] + csel x0, x20, x0, lo + csel x1, x21, x1, lo + ldp x20, x21, [x_2] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + + ldp x20, x21, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x20, x2, lo + csel x3, x21, x3, lo + ldp x20, x21, [x_2+16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + + ldp x20, x21, [x_1+32] + ldp x4, x5, [resx+32] + csel x4, x20, x4, lo + csel x5, x21, x5, lo + ldp x20, x21, [x_2+32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + + ldp x20, x21, [x_1+48] + ldp x6, x7, [resx+48] + csel x6, x20, x6, lo + csel x7, x21, x7, lo + ldp x20, x21, [x_2+48] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + + ldr x20, [x_1+64] + ldr x8, [resx+64] + csel x8, x20, x8, lo + ldr x21, [x_2+64] + csel x8, x21, x8, hi + + + ldp x20, x21, [y_1] + ldp x10, x11, [resy] + csel x10, x20, x10, lo + csel x11, x21, x11, lo + ldp x20, x21, [y_2] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + + ldp x20, x21, [y_1+16] + ldp x12, x13, [resy+16] + csel x12, x20, x12, lo + csel x13, x21, x13, lo + ldp x20, x21, [y_2+16] + csel x12, x20, x12, hi + csel x13, x21, x13, hi + + ldp x20, x21, [y_1+32] + ldp 
x14, x15, [resy+32] + csel x14, x20, x14, lo + csel x15, x21, x15, lo + ldp x20, x21, [y_2+32] + csel x14, x20, x14, hi + csel x15, x21, x15, hi + + ldp x20, x21, [y_1+48] + ldp x16, x17, [resy+48] + csel x16, x20, x16, lo + csel x17, x21, x17, lo + ldp x20, x21, [y_2+48] + csel x16, x20, x16, hi + csel x17, x21, x17, hi + + ldr x20, [y_1+64] + ldr x19, [resy+64] + csel x19, x20, x19, lo + ldr x21, [y_2+64] + csel x19, x21, x19, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + + ldp x0, x1, [resz] + ldp x2, x3, [resz+16] + ldp x4, x5, [resz+32] + ldp x6, x7, [resz+48] + ldr x8, [resz+64] + + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble.S similarity index 68% rename from third_party/s2n-bignum/arm/p521/p521_jdouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble.S index 73afe4ffbd5..aa441a27ca4 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jdouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble.S @@ -65,353 +65,353 @@ // Call local code very close to bignum_mul_p521 and bignum_sqr_p521. #define mul_p521(P0,P1,P2) \ - add x0, P0; \ - add x1, P1; \ - add x2, P2; \ - bl local_mul_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + add x2, P2 __LF \ + bl p521_jdouble_local_mul_p521 // Call local code equivalent to bignum_sqr_p521 #define sqr_p521(P0,P1) \ - add x0, P0; \ - add x1, P1; \ - bl local_sqr_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + bl p521_jdouble_local_sqr_p521 // Corresponds exactly to bignum_add_p521 #define add_p521(P0,P1,P2) \ - cmp xzr, xzr; \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - adcs x5, x5, x4; \ - adcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - adcs x7, x7, x4; \ - adcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - adcs x9, x9, x4; \ - adcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - adcs x11, x11, x4; \ - adcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - adc x13, x13, x4; \ - subs x4, x13, #512; \ - csetm x4, hs; \ - sbcs x5, x5, xzr; \ - and x4, x4, #0x200; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbc x13, x13, x4; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ + cmp xzr, xzr __LF \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + adcs x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + adcs x11, x11, x4 __LF \ + adcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] 
__LF \ + adc x13, x13, x4 __LF \ + subs x4, x13, #512 __LF \ + csetm x4, hs __LF \ + sbcs x5, x5, xzr __LF \ + and x4, x4, #0x200 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ str x13, [P0+64] // Corresponds exactly to bignum_sub_p521 #define sub_p521(P0,P1,P2) \ - ldp x5, x6, [P1]; \ - ldp x4, x3, [P2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [P1+16]; \ - ldp x4, x3, [P2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - ldp x9, x10, [P1+32]; \ - ldp x4, x3, [P2+32]; \ - sbcs x9, x9, x4; \ - sbcs x10, x10, x3; \ - ldp x11, x12, [P1+48]; \ - ldp x4, x3, [P2+48]; \ - sbcs x11, x11, x4; \ - sbcs x12, x12, x3; \ - ldr x13, [P1+64]; \ - ldr x4, [P2+64]; \ - sbcs x13, x13, x4; \ - sbcs x5, x5, xzr; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbcs x10, x10, xzr; \ - sbcs x11, x11, xzr; \ - sbcs x12, x12, xzr; \ - sbcs x13, x13, xzr; \ - and x13, x13, #0x1ff; \ - stp x5, x6, [P0]; \ - stp x7, x8, [P0+16]; \ - stp x9, x10, [P0+32]; \ - stp x11, x12, [P0+48]; \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ str x13, [P0+64] // P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) #define cmsub_p521(P0,C,P1,D,P2) \ - ldp x6, x7, [P1]; \ - mov x1, #(C); \ - mul x3, x1, x6; \ - mul x4, x1, x7; \ - umulh x6, x1, x6; \ - adds x4, x4, x6; \ - umulh x7, x1, x7; \ - ldp x8, x9, [P1+16]; \ - mul x5, x1, x8; \ - mul x6, x1, x9; \ - umulh x8, x1, x8; \ - adcs x5, x5, x7; \ - umulh x9, x1, x9; \ - adcs x6, x6, x8; \ - ldp x10, x11, [P1+32]; \ - mul x7, x1, x10; \ - mul x8, x1, x11; \ - umulh x10, x1, x10; \ - adcs x7, x7, x9; \ - umulh x11, x1, x11; \ - adcs x8, x8, x10; \ - ldp x12, x13, [P1+48]; \ - mul x9, x1, x12; \ - mul x10, x1, x13; \ - umulh x12, x1, x12; \ - adcs x9, x9, x11; \ - umulh x13, x1, x13; \ - adcs x10, x10, x12; \ - ldr x14, [P1+64]; \ - mul x11, x1, x14; \ - adc x11, x11, x13; \ - mov x1, #(D); \ - ldp x20, x21, [P2]; \ - mvn x20, x20; \ - mul x0, x1, x20; \ - umulh x20, x1, x20; \ - adds x3, x3, x0; \ - mvn x21, x21; \ - mul x0, x1, x21; \ - umulh x21, x1, x21; \ - adcs x4, x4, x0; \ - ldp x22, x23, [P2+16]; \ - mvn x22, x22; \ - mul x0, x1, x22; \ - umulh x22, x1, x22; \ - adcs x5, x5, x0; \ - mvn x23, x23; \ - mul x0, x1, x23; \ - umulh x23, x1, x23; \ - adcs x6, x6, x0; \ - ldp x17, x19, [P2+32]; \ - mvn x17, x17; \ - mul x0, x1, x17; \ - umulh x17, x1, x17; \ - adcs x7, x7, x0; \ - mvn x19, x19; \ - 
mul x0, x1, x19; \ - umulh x19, x1, x19; \ - adcs x8, x8, x0; \ - ldp x2, x16, [P2+48]; \ - mvn x2, x2; \ - mul x0, x1, x2; \ - umulh x2, x1, x2; \ - adcs x9, x9, x0; \ - mvn x16, x16; \ - mul x0, x1, x16; \ - umulh x16, x1, x16; \ - adcs x10, x10, x0; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - mul x0, x1, x0; \ - adc x11, x11, x0; \ - adds x4, x4, x20; \ - adcs x5, x5, x21; \ - and x15, x4, x5; \ - adcs x6, x6, x22; \ - and x15, x15, x6; \ - adcs x7, x7, x23; \ - and x15, x15, x7; \ - adcs x8, x8, x17; \ - and x15, x15, x8; \ - adcs x9, x9, x19; \ - and x15, x15, x9; \ - adcs x10, x10, x2; \ - and x15, x15, x10; \ - adc x11, x11, x16; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ + ldp x6, x7, [P1] __LF \ + mov x1, #(C) __LF \ + mul x3, x1, x6 __LF \ + mul x4, x1, x7 __LF \ + umulh x6, x1, x6 __LF \ + adds x4, x4, x6 __LF \ + umulh x7, x1, x7 __LF \ + ldp x8, x9, [P1+16] __LF \ + mul x5, x1, x8 __LF \ + mul x6, x1, x9 __LF \ + umulh x8, x1, x8 __LF \ + adcs x5, x5, x7 __LF \ + umulh x9, x1, x9 __LF \ + adcs x6, x6, x8 __LF \ + ldp x10, x11, [P1+32] __LF \ + mul x7, x1, x10 __LF \ + mul x8, x1, x11 __LF \ + umulh x10, x1, x10 __LF \ + adcs x7, x7, x9 __LF \ + umulh x11, x1, x11 __LF \ + adcs x8, x8, x10 __LF \ + ldp x12, x13, [P1+48] __LF \ + mul x9, x1, x12 __LF \ + mul x10, x1, x13 __LF \ + umulh x12, x1, x12 __LF \ + adcs x9, x9, x11 __LF \ + umulh x13, x1, x13 __LF \ + adcs x10, x10, x12 __LF \ + ldr x14, [P1+64] __LF \ + mul x11, x1, x14 __LF \ + adc x11, x11, x13 __LF \ + mov x1, #(D) __LF \ + ldp x20, x21, [P2] __LF \ + mvn x20, x20 __LF \ + mul x0, x1, x20 __LF \ + umulh x20, x1, x20 __LF \ + adds x3, x3, x0 __LF \ + mvn x21, x21 __LF \ + mul x0, x1, x21 __LF \ + umulh x21, x1, x21 __LF \ + adcs x4, x4, x0 __LF \ + ldp x22, x23, [P2+16] __LF \ + mvn x22, x22 __LF \ + mul x0, x1, x22 __LF \ + umulh x22, x1, x22 __LF \ + adcs x5, x5, x0 __LF \ + mvn x23, x23 __LF \ + mul x0, x1, x23 __LF \ + umulh x23, x1, x23 __LF \ + adcs x6, x6, x0 __LF \ + ldp x17, x19, [P2+32] __LF \ + mvn x17, x17 __LF \ + mul x0, x1, x17 __LF \ + umulh x17, x1, x17 __LF \ + adcs x7, x7, x0 __LF \ + mvn x19, x19 __LF \ + mul x0, x1, x19 __LF \ + umulh x19, x1, x19 __LF \ + adcs x8, x8, x0 __LF \ + ldp x2, x16, [P2+48] __LF \ + mvn x2, x2 __LF \ + mul x0, x1, x2 __LF \ + umulh x2, x1, x2 __LF \ + adcs x9, x9, x0 __LF \ + mvn x16, x16 __LF \ + mul x0, x1, x16 __LF \ + umulh x16, x1, x16 __LF \ + adcs x10, x10, x0 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + mul x0, x1, x0 __LF \ + adc x11, x11, x0 __LF \ + adds x4, x4, x20 __LF \ + adcs x5, x5, x21 __LF \ + and x15, x4, x5 __LF \ + adcs x6, x6, x22 __LF \ + and x15, x15, x6 __LF \ + adcs x7, x7, x23 __LF \ + and x15, x15, x7 __LF \ + adcs x8, x8, x17 __LF \ + and x15, x15, x8 __LF \ + adcs x9, x9, x19 __LF \ + and x15, x15, x9 __LF \ + adcs x10, x10, x2 __LF \ + and x15, x15, x10 __LF \ + adc x11, x11, x16 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ + adcs xzr, x15, xzr __LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs 
x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ str x11, [P0+64] // P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) #define cmsub38_p521(P0,P1,P2) \ - ldp x6, x7, [P1]; \ - lsl x3, x6, #1; \ - adds x3, x3, x6; \ - extr x4, x7, x6, #63; \ - adcs x4, x4, x7; \ - ldp x8, x9, [P1+16]; \ - extr x5, x8, x7, #63; \ - adcs x5, x5, x8; \ - extr x6, x9, x8, #63; \ - adcs x6, x6, x9; \ - ldp x10, x11, [P1+32]; \ - extr x7, x10, x9, #63; \ - adcs x7, x7, x10; \ - extr x8, x11, x10, #63; \ - adcs x8, x8, x11; \ - ldp x12, x13, [P1+48]; \ - extr x9, x12, x11, #63; \ - adcs x9, x9, x12; \ - extr x10, x13, x12, #63; \ - adcs x10, x10, x13; \ - ldr x14, [P1+64]; \ - extr x11, x14, x13, #63; \ - adc x11, x11, x14; \ - ldp x20, x21, [P2]; \ - mvn x20, x20; \ - lsl x0, x20, #3; \ - adds x3, x3, x0; \ - mvn x21, x21; \ - extr x0, x21, x20, #61; \ - adcs x4, x4, x0; \ - ldp x22, x23, [P2+16]; \ - mvn x22, x22; \ - extr x0, x22, x21, #61; \ - adcs x5, x5, x0; \ - and x15, x4, x5; \ - mvn x23, x23; \ - extr x0, x23, x22, #61; \ - adcs x6, x6, x0; \ - and x15, x15, x6; \ - ldp x20, x21, [P2+32]; \ - mvn x20, x20; \ - extr x0, x20, x23, #61; \ - adcs x7, x7, x0; \ - and x15, x15, x7; \ - mvn x21, x21; \ - extr x0, x21, x20, #61; \ - adcs x8, x8, x0; \ - and x15, x15, x8; \ - ldp x22, x23, [P2+48]; \ - mvn x22, x22; \ - extr x0, x22, x21, #61; \ - adcs x9, x9, x0; \ - and x15, x15, x9; \ - mvn x23, x23; \ - extr x0, x23, x22, #61; \ - adcs x10, x10, x0; \ - and x15, x15, x10; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - extr x0, x0, x23, #61; \ - adc x11, x11, x0; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ + ldp x6, x7, [P1] __LF \ + lsl x3, x6, #1 __LF \ + adds x3, x3, x6 __LF \ + extr x4, x7, x6, #63 __LF \ + adcs x4, x4, x7 __LF \ + ldp x8, x9, [P1+16] __LF \ + extr x5, x8, x7, #63 __LF \ + adcs x5, x5, x8 __LF \ + extr x6, x9, x8, #63 __LF \ + adcs x6, x6, x9 __LF \ + ldp x10, x11, [P1+32] __LF \ + extr x7, x10, x9, #63 __LF \ + adcs x7, x7, x10 __LF \ + extr x8, x11, x10, #63 __LF \ + adcs x8, x8, x11 __LF \ + ldp x12, x13, [P1+48] __LF \ + extr x9, x12, x11, #63 __LF \ + adcs x9, x9, x12 __LF \ + extr x10, x13, x12, #63 __LF \ + adcs x10, x10, x13 __LF \ + ldr x14, [P1+64] __LF \ + extr x11, x14, x13, #63 __LF \ + adc x11, x11, x14 __LF \ + ldp x20, x21, [P2] __LF \ + mvn x20, x20 __LF \ + lsl x0, x20, #3 __LF \ + adds x3, x3, x0 __LF \ + mvn x21, x21 __LF \ + extr x0, x21, x20, #61 __LF \ + adcs x4, x4, x0 __LF \ + ldp x22, x23, [P2+16] __LF \ + mvn x22, x22 __LF \ + extr x0, x22, x21, #61 __LF \ + adcs x5, x5, x0 __LF \ + and x15, x4, x5 __LF \ + mvn x23, x23 __LF \ + extr x0, x23, x22, #61 __LF \ + adcs x6, x6, x0 __LF \ + and x15, x15, x6 __LF \ + ldp x20, x21, [P2+32] __LF \ + mvn x20, x20 __LF \ + extr x0, x20, x23, #61 __LF \ + adcs x7, x7, x0 __LF \ + and x15, x15, x7 __LF \ + 
mvn x21, x21 __LF \ + extr x0, x21, x20, #61 __LF \ + adcs x8, x8, x0 __LF \ + and x15, x15, x8 __LF \ + ldp x22, x23, [P2+48] __LF \ + mvn x22, x22 __LF \ + extr x0, x22, x21, #61 __LF \ + adcs x9, x9, x0 __LF \ + and x15, x15, x9 __LF \ + mvn x23, x23 __LF \ + extr x0, x23, x22, #61 __LF \ + adcs x10, x10, x0 __LF \ + and x15, x15, x10 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + extr x0, x0, x23, #61 __LF \ + adc x11, x11, x0 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ + adcs xzr, x15, xzr __LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ str x11, [P0+64] // P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) #define cmsub41_p521(P0,P1,P2) \ - ldp x6, x7, [P1]; \ - lsl x3, x6, #2; \ - extr x4, x7, x6, #62; \ - ldp x8, x9, [P1+16]; \ - extr x5, x8, x7, #62; \ - extr x6, x9, x8, #62; \ - ldp x10, x11, [P1+32]; \ - extr x7, x10, x9, #62; \ - extr x8, x11, x10, #62; \ - ldp x12, x13, [P1+48]; \ - extr x9, x12, x11, #62; \ - extr x10, x13, x12, #62; \ - ldr x14, [P1+64]; \ - extr x11, x14, x13, #62; \ - ldp x0, x1, [P2]; \ - mvn x0, x0; \ - adds x3, x3, x0; \ - sbcs x4, x4, x1; \ - ldp x0, x1, [P2+16]; \ - sbcs x5, x5, x0; \ - and x15, x4, x5; \ - sbcs x6, x6, x1; \ - and x15, x15, x6; \ - ldp x0, x1, [P2+32]; \ - sbcs x7, x7, x0; \ - and x15, x15, x7; \ - sbcs x8, x8, x1; \ - and x15, x15, x8; \ - ldp x0, x1, [P2+48]; \ - sbcs x9, x9, x0; \ - and x15, x15, x9; \ - sbcs x10, x10, x1; \ - and x15, x15, x10; \ - ldr x0, [P2+64]; \ - eor x0, x0, #0x1ff; \ - adc x11, x11, x0; \ - lsr x12, x11, #9; \ - orr x11, x11, #0xfffffffffffffe00; \ - cmp xzr, xzr; \ - adcs xzr, x3, x12; \ - adcs xzr, x15, xzr; \ - adcs xzr, x11, xzr; \ - adcs x3, x3, x12; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adcs x6, x6, xzr; \ - adcs x7, x7, xzr; \ - adcs x8, x8, xzr; \ - adcs x9, x9, xzr; \ - adcs x10, x10, xzr; \ - adc x11, x11, xzr; \ - and x11, x11, #0x1ff; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16]; \ - stp x7, x8, [P0+32]; \ - stp x9, x10, [P0+48]; \ + ldp x6, x7, [P1] __LF \ + lsl x3, x6, #2 __LF \ + extr x4, x7, x6, #62 __LF \ + ldp x8, x9, [P1+16] __LF \ + extr x5, x8, x7, #62 __LF \ + extr x6, x9, x8, #62 __LF \ + ldp x10, x11, [P1+32] __LF \ + extr x7, x10, x9, #62 __LF \ + extr x8, x11, x10, #62 __LF \ + ldp x12, x13, [P1+48] __LF \ + extr x9, x12, x11, #62 __LF \ + extr x10, x13, x12, #62 __LF \ + ldr x14, [P1+64] __LF \ + extr x11, x14, x13, #62 __LF \ + ldp x0, x1, [P2] __LF \ + mvn x0, x0 __LF \ + adds x3, x3, x0 __LF \ + sbcs x4, x4, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + sbcs x5, x5, x0 __LF \ + and x15, x4, x5 __LF \ + sbcs x6, x6, x1 __LF \ + and x15, x15, x6 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x7, x7, x0 __LF \ + and x15, x15, x7 __LF \ + sbcs x8, x8, x1 __LF \ + and x15, x15, x8 __LF \ + ldp x0, x1, [P2+48] __LF \ + sbcs x9, x9, x0 __LF \ + and x15, x15, x9 __LF \ + sbcs x10, x10, x1 __LF \ + and x15, x15, x10 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + adc x11, x11, x0 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ + adcs xzr, x15, xzr 
__LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ str x11, [P0+64] S2N_BN_SYMBOL(p521_jdouble): @@ -494,9 +494,9 @@ S2N_BN_SYMBOL(p521_jdouble): ret // Local versions of the two "big" field operations, identical to -// bignum_mul_p521_neon and bignum_sqr_p521_neon. +// bignum_mul_p521 and bignum_sqr_p521. -local_mul_p521: +p521_jdouble_local_mul_p521: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! @@ -1173,7 +1173,7 @@ local_mul_p521: ldp x19, x20, [sp], #16 ret -local_sqr_p521: +p521_jdouble_local_sqr_p521: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble_alt.S new file mode 100644 index 00000000000..691e62bd0eb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jdouble_alt.S @@ -0,0 +1,1458 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jdouble_alt +// (uint64_t p3[static 27],uint64_t p1[static 27]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input point are fully +// reduced mod p_521 and that the z coordinate is not zero. 
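As a reference point for the field-operation sequence that follows, here is one textbook way to double a Jacobian triple on a curve with a = -3 (which P-521 is). These are the standard formulas only, not a claim about the exact intermediate values or operation order the code below uses, and the names delta, gamma, beta, alpha are illustrative:

  delta = Z1^2
  gamma = Y1^2
  beta  = X1 * gamma
  alpha = 3 * (X1 - delta) * (X1 + delta)
  X3 = alpha^2 - 8 * beta
  Z3 = (Y1 + Z1)^2 - gamma - delta      ( = 2 * Y1 * Z1 )
  Y3 = alpha * (4 * beta - X3) - 8 * gamma^2

All arithmetic is mod p_521, and the resulting triple (X3,Y3,Z3) represents the affine point 2 * (X1/Z1^2, Y1/Z1^3).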
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jdouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y2 sp, #(NUMSIZE*1) +#define x2p sp, #(NUMSIZE*2) +#define xy2 sp, #(NUMSIZE*3) + +#define y4 sp, #(NUMSIZE*4) +#define t2 sp, #(NUMSIZE*4) + +#define dx2 sp, #(NUMSIZE*5) +#define t1 sp, #(NUMSIZE*5) + +#define d sp, #(NUMSIZE*6) +#define x4p sp, #(NUMSIZE*6) + +// NUMSIZE*7 is not 16-aligned so we round it up + +#define NSPACE (NUMSIZE*7+8) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x15, x3, x5 __LF \ + umulh x16, x3, x5 __LF \ + mul x14, x3, x6 __LF \ + umulh x17, x3, x6 __LF \ + adds x16, x16, x14 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x14, x3, x7 __LF \ + umulh x19, x3, x7 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x8 __LF \ + umulh x20, x3, x8 __LF \ + adcs x19, x19, x14 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x14, x3, x9 __LF \ + umulh x21, x3, x9 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x10 __LF \ + umulh x22, x3, x10 __LF \ + adcs x21, x21, x14 __LF \ + ldp x11, x12, [P2+48] __LF \ + mul x14, x3, x11 __LF \ + umulh x23, x3, x11 __LF \ + adcs x22, x22, x14 __LF \ + ldr x13, [P2+64] __LF \ + mul x14, x3, x12 __LF \ + umulh x24, x3, x12 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x13 __LF \ + umulh x1, x3, x13 __LF \ + adcs x24, x24, x14 __LF \ + adc x1, x1, xzr __LF \ + mul x14, x4, x5 __LF \ + adds x16, x16, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x1, x1, x14 __LF \ + cset x0, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x17, x17, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x0, x0, x14 __LF \ + stp x15, x16, [P0] __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x14, x3, x5 __LF \ + adds x17, x17, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x1, 
x1, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x0, x0, x14 __LF \ + cset x15, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x19, x19, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x15, x15, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x19, x19, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x15, x15, x14 __LF \ + cset x16, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x20, x20, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x16, x16, x14 __LF \ + stp x17, x19, [P0+16] __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x14, x3, x5 __LF \ + adds x20, x20, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x16, x16, x14 __LF \ + cset x17, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x21, x21, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x17, x17, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x21, x21, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x17, x17, x14 __LF \ + cset x19, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x22, x22, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x17, 
x17, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x19, x19, x14 __LF \ + stp x20, x21, [P0+32] __LF \ + ldp x3, x4, [P1+48] __LF \ + mul x14, x3, x5 __LF \ + adds x22, x22, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x19, x19, x14 __LF \ + cset x20, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x23, x23, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x20, x20, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x23, x23, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x20, x20, x14 __LF \ + cset x21, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x24, x24, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x21, x21, x14 __LF \ + stp x22, x23, [P0+48] __LF \ + ldr x3, [P1+64] __LF \ + mul x14, x3, x5 __LF \ + adds x24, x24, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x13 __LF \ + adc x21, x21, x14 __LF \ + umulh x14, x3, x5 __LF \ + adds x1, x1, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x12 __LF \ + adc x21, x21, x14 __LF \ + cmp xzr, xzr __LF \ + ldp x5, x6, [P0] __LF \ + extr x14, x1, x24, #9 __LF \ + adcs x5, x5, x14 __LF \ + extr x14, x0, x1, #9 __LF \ + adcs x6, x6, x14 __LF \ + ldp x7, x8, [P0+16] __LF \ + extr x14, x15, x0, #9 __LF \ + adcs x7, x7, x14 __LF \ + extr x14, x16, x15, #9 __LF \ + adcs x8, x8, x14 __LF \ + ldp x9, x10, [P0+32] __LF \ + extr x14, x17, x16, #9 __LF \ + adcs x9, x9, x14 __LF \ + extr x14, x19, x17, #9 __LF \ + adcs x10, x10, x14 __LF \ + ldp x11, 
x12, [P0+48] __LF \ + extr x14, x20, x19, #9 __LF \ + adcs x11, x11, x14 __LF \ + extr x14, x21, x20, #9 __LF \ + adcs x12, x12, x14 __LF \ + orr x13, x24, #0xfffffffffffffe00 __LF \ + lsr x14, x21, #9 __LF \ + adcs x13, x13, x14 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x11, x2, x3 __LF \ + umulh x12, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x10, x2, x4 __LF \ + umulh x13, x2, x4 __LF \ + adds x12, x12, x10 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x10, x2, x5 __LF \ + umulh x14, x2, x5 __LF \ + adcs x13, x13, x10 __LF \ + ldp x8, x9, [P1+48] __LF \ + mul x10, x2, x6 __LF \ + umulh x15, x2, x6 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x2, x7 __LF \ + umulh x16, x2, x7 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x2, x8 __LF \ + umulh x17, x2, x8 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x2, x9 __LF \ + umulh x19, x2, x9 __LF \ + adcs x17, x17, x10 __LF \ + adc x19, x19, xzr __LF \ + mul x10, x3, x4 __LF \ + adds x13, x13, x10 __LF \ + mul x10, x3, x5 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x3, x6 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x3, x7 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x3, x8 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x3, x9 __LF \ + adcs x19, x19, x10 __LF \ + cset x20, hs __LF \ + umulh x10, x3, x4 __LF \ + adds x14, x14, x10 __LF \ + umulh x10, x3, x5 __LF \ + adcs x15, x15, x10 __LF \ + umulh x10, x3, x6 __LF \ + adcs x16, x16, x10 __LF \ + umulh x10, x3, x7 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x3, x8 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x3, x9 __LF \ + adc x20, x20, x10 __LF \ + mul x10, x6, x7 __LF \ + umulh x21, x6, x7 __LF \ + adds x20, x20, x10 __LF \ + adc x21, x21, xzr __LF \ + mul x10, x4, x5 __LF \ + adds x15, x15, x10 __LF \ + mul x10, x4, x6 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x4, x7 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x4, x8 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x4, x9 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x6, x8 __LF \ + adcs x21, x21, x10 __LF \ + cset x22, hs __LF \ + umulh x10, x4, x5 __LF \ + adds x16, x16, x10 __LF \ + umulh x10, x4, x6 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x4, x7 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x4, x8 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x4, x9 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x6, x8 __LF \ + adc x22, x22, x10 __LF \ + mul x10, x7, x8 __LF \ + umulh x23, x7, x8 __LF \ + adds x22, x22, x10 __LF \ + adc x23, x23, xzr __LF \ + mul x10, x5, x6 __LF \ + adds x17, x17, x10 __LF \ + mul x10, x5, x7 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x5, x8 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x5, x9 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x6, x9 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x7, x9 __LF \ + adcs x23, x23, x10 __LF \ + cset x24, hs __LF \ + umulh x10, x5, x6 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x5, x7 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x5, x8 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x5, x9 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x6, x9 __LF \ + adcs x23, x23, x10 
__LF \ + umulh x10, x7, x9 __LF \ + adc x24, x24, x10 __LF \ + mul x10, x8, x9 __LF \ + umulh x25, x8, x9 __LF \ + adds x24, x24, x10 __LF \ + adc x25, x25, xzr __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + adcs x20, x20, x20 __LF \ + adcs x21, x21, x21 __LF \ + adcs x22, x22, x22 __LF \ + adcs x23, x23, x23 __LF \ + adcs x24, x24, x24 __LF \ + adcs x25, x25, x25 __LF \ + cset x0, hs __LF \ + umulh x10, x2, x2 __LF \ + adds x11, x11, x10 __LF \ + mul x10, x3, x3 __LF \ + adcs x12, x12, x10 __LF \ + umulh x10, x3, x3 __LF \ + adcs x13, x13, x10 __LF \ + mul x10, x4, x4 __LF \ + adcs x14, x14, x10 __LF \ + umulh x10, x4, x4 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x5, x5 __LF \ + adcs x16, x16, x10 __LF \ + umulh x10, x5, x5 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x6, x6 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x6, x6 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x7, x7 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x7, x7 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x8, x8 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x8, x8 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x9, x9 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x9, x9 __LF \ + adc x0, x0, x10 __LF \ + ldr x1, [P1+64] __LF \ + add x1, x1, x1 __LF \ + mul x10, x1, x2 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x1, x2 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x1, x4 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x1, x4 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x1, x6 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x1, x6 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x1, x8 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x1, x8 __LF \ + adcs x0, x0, x10 __LF \ + lsr x4, x1, #1 __LF \ + mul x4, x4, x4 __LF \ + adc x4, x4, xzr __LF \ + mul x10, x1, x3 __LF \ + adds x20, x20, x10 __LF \ + umulh x10, x1, x3 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x1, x5 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x1, x5 __LF \ + adcs x23, x23, x10 __LF \ + mul x10, x1, x7 __LF \ + adcs x24, x24, x10 __LF \ + umulh x10, x1, x7 __LF \ + adcs x25, x25, x10 __LF \ + mul x10, x1, x9 __LF \ + adcs x0, x0, x10 __LF \ + umulh x10, x1, x9 __LF \ + adc x4, x4, x10 __LF \ + mul x2, x2, x2 __LF \ + cmp xzr, xzr __LF \ + extr x10, x20, x19, #9 __LF \ + adcs x2, x2, x10 __LF \ + extr x10, x21, x20, #9 __LF \ + adcs x11, x11, x10 __LF \ + extr x10, x22, x21, #9 __LF \ + adcs x12, x12, x10 __LF \ + extr x10, x23, x22, #9 __LF \ + adcs x13, x13, x10 __LF \ + extr x10, x24, x23, #9 __LF \ + adcs x14, x14, x10 __LF \ + extr x10, x25, x24, #9 __LF \ + adcs x15, x15, x10 __LF \ + extr x10, x0, x25, #9 __LF \ + adcs x16, x16, x10 __LF \ + extr x10, x4, x0, #9 __LF \ + adcs x17, x17, x10 __LF \ + orr x19, x19, #0xfffffffffffffe00 __LF \ + lsr x10, x4, #9 __LF \ + adcs x19, x19, x10 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x19, x19, xzr __LF \ + and x19, x19, #0x1ff __LF \ + stp x2, x11, [P0] __LF \ + stp x12, x13, [P0+16] __LF \ + stp x14, x15, [P0+32] __LF \ + stp x16, x17, [P0+48] __LF \ + str x19, [P0+64] + +// Corresponds exactly to bignum_add_p521 + +#define add_p521(P0,P1,P2) \ + cmp xzr, xzr __LF \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ 
+ adcs x5, x5, x4 __LF \ + adcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + adcs x7, x7, x4 __LF \ + adcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + adcs x9, x9, x4 __LF \ + adcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + adcs x11, x11, x4 __LF \ + adcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] __LF \ + adc x13, x13, x4 __LF \ + subs x4, x13, #512 __LF \ + csetm x4, hs __LF \ + sbcs x5, x5, xzr __LF \ + and x4, x4, #0x200 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// Weak multiplication not fully reducing + +#define weakmul_p521(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x15, x3, x5 __LF \ + umulh x16, x3, x5 __LF \ + mul x14, x3, x6 __LF \ + umulh x17, x3, x6 __LF \ + adds x16, x16, x14 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x14, x3, x7 __LF \ + umulh x19, x3, x7 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x8 __LF \ + umulh x20, x3, x8 __LF \ + adcs x19, x19, x14 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x14, x3, x9 __LF \ + umulh x21, x3, x9 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x10 __LF \ + umulh x22, x3, x10 __LF \ + adcs x21, x21, x14 __LF \ + ldp x11, x12, [P2+48] __LF \ + mul x14, x3, x11 __LF \ + umulh x23, x3, x11 __LF \ + adcs x22, x22, x14 __LF \ + ldr x13, [P2+64] __LF \ + mul x14, x3, x12 __LF \ + umulh x24, x3, x12 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x13 __LF \ + umulh x1, x3, x13 __LF \ + adcs x24, x24, x14 __LF \ + adc x1, x1, xzr __LF \ + mul x14, x4, x5 __LF \ + adds x16, x16, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x1, x1, x14 __LF \ + cset x0, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x17, x17, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x20, x20, x14 __LF \ + 
umulh x14, x4, x8 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x0, x0, x14 __LF \ + stp x15, x16, [P0] __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x14, x3, x5 __LF \ + adds x17, x17, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x0, x0, x14 __LF \ + cset x15, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x19, x19, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x15, x15, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x19, x19, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x15, x15, x14 __LF \ + cset x16, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x20, x20, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x16, x16, x14 __LF \ + stp x17, x19, [P0+16] __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x14, x3, x5 __LF \ + adds x20, x20, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x16, x16, x14 __LF \ + cset x17, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x21, x21, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x17, x17, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x21, x21, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x7 __LF \ 
+ adcs x23, x23, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x17, x17, x14 __LF \ + cset x19, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x22, x22, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x19, x19, x14 __LF \ + stp x20, x21, [P0+32] __LF \ + ldp x3, x4, [P1+48] __LF \ + mul x14, x3, x5 __LF \ + adds x22, x22, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x19, x19, x14 __LF \ + cset x20, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x23, x23, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x20, x20, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x23, x23, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x20, x20, x14 __LF \ + cset x21, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x24, x24, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x21, x21, x14 __LF \ + stp x22, x23, [P0+48] __LF \ + ldr x3, [P1+64] __LF \ + mul x14, x3, x5 __LF \ + adds x24, x24, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x13 __LF \ + adc x21, x21, x14 __LF \ + umulh x14, x3, x5 __LF \ + adds x1, x1, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x0, x0, x14 __LF \ + umulh 
x14, x3, x7 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x12 __LF \ + adc x21, x21, x14 __LF \ + ldp x5, x6, [P0] __LF \ + extr x14, x1, x24, #9 __LF \ + adds x5, x5, x14 __LF \ + extr x14, x0, x1, #9 __LF \ + adcs x6, x6, x14 __LF \ + ldp x7, x8, [P0+16] __LF \ + extr x14, x15, x0, #9 __LF \ + adcs x7, x7, x14 __LF \ + extr x14, x16, x15, #9 __LF \ + adcs x8, x8, x14 __LF \ + ldp x9, x10, [P0+32] __LF \ + extr x14, x17, x16, #9 __LF \ + adcs x9, x9, x14 __LF \ + extr x14, x19, x17, #9 __LF \ + adcs x10, x10, x14 __LF \ + ldp x11, x12, [P0+48] __LF \ + extr x14, x20, x19, #9 __LF \ + adcs x11, x11, x14 __LF \ + extr x14, x21, x20, #9 __LF \ + adcs x12, x12, x14 __LF \ + and x13, x24, #0x1ff __LF \ + lsr x14, x21, #9 __LF \ + adc x13, x13, x14 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// P0 = C * P1 - D * P2 == C * P1 + D * (p_521 - P2) + +#define cmsub_p521(P0,C,P1,D,P2) \ + ldp x6, x7, [P1] __LF \ + mov x1, #(C) __LF \ + mul x3, x1, x6 __LF \ + mul x4, x1, x7 __LF \ + umulh x6, x1, x6 __LF \ + adds x4, x4, x6 __LF \ + umulh x7, x1, x7 __LF \ + ldp x8, x9, [P1+16] __LF \ + mul x5, x1, x8 __LF \ + mul x6, x1, x9 __LF \ + umulh x8, x1, x8 __LF \ + adcs x5, x5, x7 __LF \ + umulh x9, x1, x9 __LF \ + adcs x6, x6, x8 __LF \ + ldp x10, x11, [P1+32] __LF \ + mul x7, x1, x10 __LF \ + mul x8, x1, x11 __LF \ + umulh x10, x1, x10 __LF \ + adcs x7, x7, x9 __LF \ + umulh x11, x1, x11 __LF \ + adcs x8, x8, x10 __LF \ + ldp x12, x13, [P1+48] __LF \ + mul x9, x1, x12 __LF \ + mul x10, x1, x13 __LF \ + umulh x12, x1, x12 __LF \ + adcs x9, x9, x11 __LF \ + umulh x13, x1, x13 __LF \ + adcs x10, x10, x12 __LF \ + ldr x14, [P1+64] __LF \ + mul x11, x1, x14 __LF \ + adc x11, x11, x13 __LF \ + mov x1, #(D) __LF \ + ldp x20, x21, [P2] __LF \ + mvn x20, x20 __LF \ + mul x0, x1, x20 __LF \ + umulh x20, x1, x20 __LF \ + adds x3, x3, x0 __LF \ + mvn x21, x21 __LF \ + mul x0, x1, x21 __LF \ + umulh x21, x1, x21 __LF \ + adcs x4, x4, x0 __LF \ + ldp x22, x23, [P2+16] __LF \ + mvn x22, x22 __LF \ + mul x0, x1, x22 __LF \ + umulh x22, x1, x22 __LF \ + adcs x5, x5, x0 __LF \ + mvn x23, x23 __LF \ + mul x0, x1, x23 __LF \ + umulh x23, x1, x23 __LF \ + adcs x6, x6, x0 __LF \ + ldp x17, x19, [P2+32] __LF \ + mvn x17, x17 __LF \ + mul x0, x1, x17 __LF \ + umulh x17, x1, x17 __LF \ + adcs x7, x7, x0 __LF \ + mvn x19, x19 __LF \ + mul x0, x1, x19 __LF \ + umulh x19, x1, x19 __LF \ + adcs x8, x8, x0 __LF \ + ldp x2, x16, [P2+48] __LF \ + mvn x2, x2 __LF \ + mul x0, x1, x2 __LF \ + umulh x2, x1, x2 __LF \ + adcs x9, x9, x0 __LF \ + mvn x16, x16 __LF \ + mul x0, x1, x16 __LF \ + umulh x16, x1, x16 __LF \ + adcs x10, x10, x0 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + mul x0, x1, x0 __LF \ + adc x11, x11, x0 __LF \ + adds x4, x4, x20 __LF \ + adcs x5, x5, x21 __LF \ + and x15, x4, x5 __LF \ + adcs x6, x6, x22 __LF \ + and x15, x15, x6 __LF \ + adcs x7, x7, x23 __LF \ + and x15, x15, x7 __LF \ + adcs x8, x8, x17 __LF \ + and x15, x15, x8 __LF \ + adcs x9, x9, x19 __LF \ + and x15, x15, x9 __LF \ + adcs x10, x10, x2 __LF \ + and x15, x15, x10 __LF \ + adc x11, x11, x16 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ 
+ adcs xzr, x15, xzr __LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ + str x11, [P0+64] + +// P0 = 3 * P1 - 8 * P2 == 3 * P1 + 8 * (p_521 - P2) + +#define cmsub38_p521(P0,P1,P2) \ + ldp x6, x7, [P1] __LF \ + lsl x3, x6, #1 __LF \ + adds x3, x3, x6 __LF \ + extr x4, x7, x6, #63 __LF \ + adcs x4, x4, x7 __LF \ + ldp x8, x9, [P1+16] __LF \ + extr x5, x8, x7, #63 __LF \ + adcs x5, x5, x8 __LF \ + extr x6, x9, x8, #63 __LF \ + adcs x6, x6, x9 __LF \ + ldp x10, x11, [P1+32] __LF \ + extr x7, x10, x9, #63 __LF \ + adcs x7, x7, x10 __LF \ + extr x8, x11, x10, #63 __LF \ + adcs x8, x8, x11 __LF \ + ldp x12, x13, [P1+48] __LF \ + extr x9, x12, x11, #63 __LF \ + adcs x9, x9, x12 __LF \ + extr x10, x13, x12, #63 __LF \ + adcs x10, x10, x13 __LF \ + ldr x14, [P1+64] __LF \ + extr x11, x14, x13, #63 __LF \ + adc x11, x11, x14 __LF \ + ldp x20, x21, [P2] __LF \ + mvn x20, x20 __LF \ + lsl x0, x20, #3 __LF \ + adds x3, x3, x0 __LF \ + mvn x21, x21 __LF \ + extr x0, x21, x20, #61 __LF \ + adcs x4, x4, x0 __LF \ + ldp x22, x23, [P2+16] __LF \ + mvn x22, x22 __LF \ + extr x0, x22, x21, #61 __LF \ + adcs x5, x5, x0 __LF \ + and x15, x4, x5 __LF \ + mvn x23, x23 __LF \ + extr x0, x23, x22, #61 __LF \ + adcs x6, x6, x0 __LF \ + and x15, x15, x6 __LF \ + ldp x20, x21, [P2+32] __LF \ + mvn x20, x20 __LF \ + extr x0, x20, x23, #61 __LF \ + adcs x7, x7, x0 __LF \ + and x15, x15, x7 __LF \ + mvn x21, x21 __LF \ + extr x0, x21, x20, #61 __LF \ + adcs x8, x8, x0 __LF \ + and x15, x15, x8 __LF \ + ldp x22, x23, [P2+48] __LF \ + mvn x22, x22 __LF \ + extr x0, x22, x21, #61 __LF \ + adcs x9, x9, x0 __LF \ + and x15, x15, x9 __LF \ + mvn x23, x23 __LF \ + extr x0, x23, x22, #61 __LF \ + adcs x10, x10, x0 __LF \ + and x15, x15, x10 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + extr x0, x0, x23, #61 __LF \ + adc x11, x11, x0 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ + adcs xzr, x15, xzr __LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ + str x11, [P0+64] + +// P0 = 4 * P1 - P2 = 4 * P1 + (p_521 - P2) + +#define cmsub41_p521(P0,P1,P2) \ + ldp x6, x7, [P1] __LF \ + lsl x3, x6, #2 __LF \ + extr x4, x7, x6, #62 __LF \ + ldp x8, x9, [P1+16] __LF \ + extr x5, x8, x7, #62 __LF \ + extr x6, x9, x8, #62 __LF \ + ldp x10, x11, [P1+32] __LF \ + extr x7, x10, x9, #62 __LF \ + extr x8, x11, x10, #62 __LF \ + ldp x12, x13, [P1+48] __LF \ + extr x9, x12, x11, #62 __LF \ + extr x10, x13, x12, #62 __LF \ + ldr x14, [P1+64] __LF \ + extr x11, x14, x13, #62 __LF \ + ldp x0, x1, [P2] __LF \ + mvn x0, x0 __LF \ + adds x3, x3, x0 __LF \ + sbcs x4, x4, x1 __LF \ + ldp x0, x1, [P2+16] __LF \ + sbcs x5, x5, x0 __LF \ + and x15, x4, x5 __LF \ + sbcs x6, x6, x1 __LF \ + and x15, x15, x6 __LF \ + ldp x0, x1, [P2+32] __LF \ + sbcs x7, 
x7, x0 __LF \ + and x15, x15, x7 __LF \ + sbcs x8, x8, x1 __LF \ + and x15, x15, x8 __LF \ + ldp x0, x1, [P2+48] __LF \ + sbcs x9, x9, x0 __LF \ + and x15, x15, x9 __LF \ + sbcs x10, x10, x1 __LF \ + and x15, x15, x10 __LF \ + ldr x0, [P2+64] __LF \ + eor x0, x0, #0x1ff __LF \ + adc x11, x11, x0 __LF \ + lsr x12, x11, #9 __LF \ + orr x11, x11, #0xfffffffffffffe00 __LF \ + cmp xzr, xzr __LF \ + adcs xzr, x3, x12 __LF \ + adcs xzr, x15, xzr __LF \ + adcs xzr, x11, xzr __LF \ + adcs x3, x3, x12 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, xzr __LF \ + adcs x7, x7, xzr __LF \ + adcs x8, x8, xzr __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + adc x11, x11, xzr __LF \ + and x11, x11, #0x1ff __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] __LF \ + stp x7, x8, [P0+32] __LF \ + stp x9, x10, [P0+48] __LF \ + str x11, [P0+64] + +S2N_BN_SYMBOL(p521_jdouble_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + sqr_p521(z2,z_1) + sqr_p521(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + add_p521(t1,x_1,z2) + sub_p521(t2,x_1,z2) + mul_p521(x2p,t1,t2) + +// t1 = y + z +// x4p = x2p^2 +// xy2 = x * y^2 + + add_p521(t1,y_1,z_1) + sqr_p521(x4p,x2p) + weakmul_p521(xy2,x_1,y2) + +// t2 = (y + z)^2 + + sqr_p521(t2,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p521(d,12,xy2,9,x4p) + sub_p521(t1,t2,z2) + +// y4 = y^4 + + sqr_p521(y4,y2) + +// z_3' = 2 * y * z +// dx2 = d * x2p + + sub_p521(z_3,t1,y2) + weakmul_p521(dx2,d,x2p) + +// x' = 4 * xy2 - d + + cmsub41_p521(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p521(y_3,dx2,y4) + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/p521_jmixadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd.S index b04e39327fc..2ee6433b043 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jmixadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd.S @@ -81,21 +81,21 @@ // and bignum_sub_p521 #define mul_p521(P0,P1,P2) \ - add x0, P0; \ - add x1, P1; \ - add x2, P2; \ - bl local_mul_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + add x2, P2 __LF \ + bl p521_jmixadd_local_mul_p521 #define sqr_p521(P0,P1) \ - add x0, P0; \ - add x1, P1; \ - bl local_sqr_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + bl p521_jmixadd_local_sqr_p521 #define sub_p521(P0,P1,P2) \ - add x0, P0; \ - add x1, P1; \ - add x2, P2; \ - bl local_sub_p521 + add x0, P0 __LF \ + add x1, P1 __LF \ + add x2, P2 __LF \ + bl p521_jmixadd_local_sub_p521 S2N_BN_SYMBOL(p521_jmixadd): @@ -258,7 +258,7 @@ S2N_BN_SYMBOL(p521_jmixadd): // local_mul_p521, using the tmp buffer as temporary storage and // avoiding x26. 
-local_mul_p521: +p521_jmixadd_local_mul_p521: ldp x3, x4, [x1] ldp x5, x6, [x1, #16] ldp x7, x8, [x2] @@ -885,7 +885,7 @@ local_mul_p521: str x22, [x0, #64] ret -local_sqr_p521: +p521_jmixadd_local_sqr_p521: ldp x2, x3, [x1] ldp x4, x5, [x1, #16] ldp x6, x7, [x1, #32] @@ -1300,7 +1300,7 @@ local_sqr_p521: str x10, [x0, #64] ret -local_sub_p521: +p521_jmixadd_local_sub_p521: ldp x5, x6, [x1] ldp x4, x3, [x2] subs x5, x5, x4 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd_alt.S new file mode 100644 index 00000000000..006d8ddc9f4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jmixadd_alt.S @@ -0,0 +1,882 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-521 in Jacobian coordinates +// +// extern void p521_jmixadd_alt +// (uint64_t p3[static 27],uint64_t p1[static 27],uint64_t p2[static 18]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_521, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jmixadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 + +// Stable homes for input arguments during main code sequence + +#define input_z x26 +#define input_x x27 +#define input_y x28 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p521_alt + +#define mul_p521(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x15, x3, x5 __LF \ + umulh x16, x3, x5 __LF \ + mul x14, x3, x6 __LF \ + umulh x17, x3, x6 __LF \ + adds x16, x16, x14 __LF \ + ldp x7, x8, [P2+16] __LF \ + mul x14, x3, x7 __LF \ + umulh x19, x3, x7 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x8 __LF \ + umulh x20, x3, x8 __LF \ + adcs x19, x19, x14 __LF \ + ldp x9, x10, [P2+32] __LF \ + mul x14, x3, x9 __LF \ + umulh x21, x3, 
x9 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x10 __LF \ + umulh x22, x3, x10 __LF \ + adcs x21, x21, x14 __LF \ + ldp x11, x12, [P2+48] __LF \ + mul x14, x3, x11 __LF \ + umulh x23, x3, x11 __LF \ + adcs x22, x22, x14 __LF \ + ldr x13, [P2+64] __LF \ + mul x14, x3, x12 __LF \ + umulh x24, x3, x12 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x13 __LF \ + umulh x1, x3, x13 __LF \ + adcs x24, x24, x14 __LF \ + adc x1, x1, xzr __LF \ + mul x14, x4, x5 __LF \ + adds x16, x16, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x1, x1, x14 __LF \ + cset x0, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x17, x17, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x0, x0, x14 __LF \ + stp x15, x16, [P0] __LF \ + ldp x3, x4, [P1+16] __LF \ + mul x14, x3, x5 __LF \ + adds x17, x17, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x0, x0, x14 __LF \ + cset x15, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x19, x19, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x15, x15, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x19, x19, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x15, x15, x14 __LF \ + cset x16, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x20, x20, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x21, x21, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x16, x16, x14 __LF \ + stp x17, x19, [P0+16] __LF \ + ldp x3, x4, [P1+32] __LF \ + mul x14, 
x3, x5 __LF \ + adds x20, x20, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x21, x21, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x16, x16, x14 __LF \ + cset x17, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x21, x21, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x22, x22, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x17, x17, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x21, x21, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x22, x22, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x17, x17, x14 __LF \ + cset x19, hs __LF \ + umulh x14, x4, x5 __LF \ + adds x22, x22, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x23, x23, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x19, x19, x14 __LF \ + stp x20, x21, [P0+32] __LF \ + ldp x3, x4, [P1+48] __LF \ + mul x14, x3, x5 __LF \ + adds x22, x22, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x23, x23, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x13 __LF \ + adcs x19, x19, x14 __LF \ + cset x20, hs __LF \ + umulh x14, x3, x5 __LF \ + adds x23, x23, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x24, x24, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x12 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x13 __LF \ + adc x20, x20, x14 __LF \ + mul x14, x4, x5 __LF \ + adds x23, x23, x14 __LF \ + mul x14, x4, x6 __LF \ + adcs x24, x24, x14 __LF \ + mul x14, x4, x7 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x4, x8 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x4, x9 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x4, x10 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x4, x11 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x4, x12 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x4, x13 __LF \ + adcs x20, x20, x14 __LF \ + cset x21, hs __LF \ + umulh x14, x4, x5 __LF \ 
+ adds x24, x24, x14 __LF \ + umulh x14, x4, x6 __LF \ + adcs x1, x1, x14 __LF \ + umulh x14, x4, x7 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x4, x8 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x4, x9 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x4, x10 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x4, x11 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x4, x12 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x4, x13 __LF \ + adc x21, x21, x14 __LF \ + stp x22, x23, [P0+48] __LF \ + ldr x3, [P1+64] __LF \ + mul x14, x3, x5 __LF \ + adds x24, x24, x14 __LF \ + mul x14, x3, x6 __LF \ + adcs x1, x1, x14 __LF \ + mul x14, x3, x7 __LF \ + adcs x0, x0, x14 __LF \ + mul x14, x3, x8 __LF \ + adcs x15, x15, x14 __LF \ + mul x14, x3, x9 __LF \ + adcs x16, x16, x14 __LF \ + mul x14, x3, x10 __LF \ + adcs x17, x17, x14 __LF \ + mul x14, x3, x11 __LF \ + adcs x19, x19, x14 __LF \ + mul x14, x3, x12 __LF \ + adcs x20, x20, x14 __LF \ + mul x14, x3, x13 __LF \ + adc x21, x21, x14 __LF \ + umulh x14, x3, x5 __LF \ + adds x1, x1, x14 __LF \ + umulh x14, x3, x6 __LF \ + adcs x0, x0, x14 __LF \ + umulh x14, x3, x7 __LF \ + adcs x15, x15, x14 __LF \ + umulh x14, x3, x8 __LF \ + adcs x16, x16, x14 __LF \ + umulh x14, x3, x9 __LF \ + adcs x17, x17, x14 __LF \ + umulh x14, x3, x10 __LF \ + adcs x19, x19, x14 __LF \ + umulh x14, x3, x11 __LF \ + adcs x20, x20, x14 __LF \ + umulh x14, x3, x12 __LF \ + adc x21, x21, x14 __LF \ + cmp xzr, xzr __LF \ + ldp x5, x6, [P0] __LF \ + extr x14, x1, x24, #9 __LF \ + adcs x5, x5, x14 __LF \ + extr x14, x0, x1, #9 __LF \ + adcs x6, x6, x14 __LF \ + ldp x7, x8, [P0+16] __LF \ + extr x14, x15, x0, #9 __LF \ + adcs x7, x7, x14 __LF \ + extr x14, x16, x15, #9 __LF \ + adcs x8, x8, x14 __LF \ + ldp x9, x10, [P0+32] __LF \ + extr x14, x17, x16, #9 __LF \ + adcs x9, x9, x14 __LF \ + extr x14, x19, x17, #9 __LF \ + adcs x10, x10, x14 __LF \ + ldp x11, x12, [P0+48] __LF \ + extr x14, x20, x19, #9 __LF \ + adcs x11, x11, x14 __LF \ + extr x14, x21, x20, #9 __LF \ + adcs x12, x12, x14 __LF \ + orr x13, x24, #0xfffffffffffffe00 __LF \ + lsr x14, x21, #9 __LF \ + adcs x13, x13, x14 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbc x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +// Corresponds exactly to bignum_sqr_p521_alt + +#define sqr_p521(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x11, x2, x3 __LF \ + umulh x12, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x10, x2, x4 __LF \ + umulh x13, x2, x4 __LF \ + adds x12, x12, x10 __LF \ + ldp x6, x7, [P1+32] __LF \ + mul x10, x2, x5 __LF \ + umulh x14, x2, x5 __LF \ + adcs x13, x13, x10 __LF \ + ldp x8, x9, [P1+48] __LF \ + mul x10, x2, x6 __LF \ + umulh x15, x2, x6 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x2, x7 __LF \ + umulh x16, x2, x7 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x2, x8 __LF \ + umulh x17, x2, x8 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x2, x9 __LF \ + umulh x19, x2, x9 __LF \ + adcs x17, x17, x10 __LF \ + adc x19, x19, xzr __LF \ + mul x10, x3, x4 __LF \ + adds x13, x13, x10 __LF \ + mul x10, x3, x5 __LF \ + adcs x14, x14, x10 __LF \ + mul x10, x3, x6 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x3, x7 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x3, x8 __LF \ + adcs x17, x17, x10 
__LF \ + mul x10, x3, x9 __LF \ + adcs x19, x19, x10 __LF \ + cset x20, hs __LF \ + umulh x10, x3, x4 __LF \ + adds x14, x14, x10 __LF \ + umulh x10, x3, x5 __LF \ + adcs x15, x15, x10 __LF \ + umulh x10, x3, x6 __LF \ + adcs x16, x16, x10 __LF \ + umulh x10, x3, x7 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x3, x8 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x3, x9 __LF \ + adc x20, x20, x10 __LF \ + mul x10, x6, x7 __LF \ + umulh x21, x6, x7 __LF \ + adds x20, x20, x10 __LF \ + adc x21, x21, xzr __LF \ + mul x10, x4, x5 __LF \ + adds x15, x15, x10 __LF \ + mul x10, x4, x6 __LF \ + adcs x16, x16, x10 __LF \ + mul x10, x4, x7 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x4, x8 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x4, x9 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x6, x8 __LF \ + adcs x21, x21, x10 __LF \ + cset x22, hs __LF \ + umulh x10, x4, x5 __LF \ + adds x16, x16, x10 __LF \ + umulh x10, x4, x6 __LF \ + adcs x17, x17, x10 __LF \ + umulh x10, x4, x7 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x4, x8 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x4, x9 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x6, x8 __LF \ + adc x22, x22, x10 __LF \ + mul x10, x7, x8 __LF \ + umulh x23, x7, x8 __LF \ + adds x22, x22, x10 __LF \ + adc x23, x23, xzr __LF \ + mul x10, x5, x6 __LF \ + adds x17, x17, x10 __LF \ + mul x10, x5, x7 __LF \ + adcs x19, x19, x10 __LF \ + mul x10, x5, x8 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x5, x9 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x6, x9 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x7, x9 __LF \ + adcs x23, x23, x10 __LF \ + cset x24, hs __LF \ + umulh x10, x5, x6 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x5, x7 __LF \ + adcs x20, x20, x10 __LF \ + umulh x10, x5, x8 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x5, x9 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x6, x9 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x7, x9 __LF \ + adc x24, x24, x10 __LF \ + mul x10, x8, x9 __LF \ + umulh x25, x8, x9 __LF \ + adds x24, x24, x10 __LF \ + adc x25, x25, xzr __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + adcs x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adcs x17, x17, x17 __LF \ + adcs x19, x19, x19 __LF \ + adcs x20, x20, x20 __LF \ + adcs x21, x21, x21 __LF \ + adcs x22, x22, x22 __LF \ + adcs x23, x23, x23 __LF \ + adcs x24, x24, x24 __LF \ + adcs x25, x25, x25 __LF \ + cset x0, hs __LF \ + umulh x10, x2, x2 __LF \ + adds x11, x11, x10 __LF \ + mul x10, x3, x3 __LF \ + adcs x12, x12, x10 __LF \ + umulh x10, x3, x3 __LF \ + adcs x13, x13, x10 __LF \ + mul x10, x4, x4 __LF \ + adcs x14, x14, x10 __LF \ + umulh x10, x4, x4 __LF \ + adcs x15, x15, x10 __LF \ + mul x10, x5, x5 __LF \ + adcs x16, x16, x10 __LF \ + umulh x10, x5, x5 __LF \ + adcs x17, x17, x10 __LF \ + mul x10, x6, x6 __LF \ + adcs x19, x19, x10 __LF \ + umulh x10, x6, x6 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x7, x7 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x7, x7 __LF \ + adcs x22, x22, x10 __LF \ + mul x10, x8, x8 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x8, x8 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x9, x9 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x9, x9 __LF \ + adc x0, x0, x10 __LF \ + ldr x1, [P1+64] __LF \ + add x1, x1, x1 __LF \ + mul x10, x1, x2 __LF \ + adds x19, x19, x10 __LF \ + umulh x10, x1, x2 __LF \ + adcs x20, x20, x10 __LF \ + mul x10, x1, x4 __LF \ + adcs x21, x21, x10 __LF \ + umulh x10, x1, x4 __LF \ + adcs x22, x22, x10 __LF 
\ + mul x10, x1, x6 __LF \ + adcs x23, x23, x10 __LF \ + umulh x10, x1, x6 __LF \ + adcs x24, x24, x10 __LF \ + mul x10, x1, x8 __LF \ + adcs x25, x25, x10 __LF \ + umulh x10, x1, x8 __LF \ + adcs x0, x0, x10 __LF \ + lsr x4, x1, #1 __LF \ + mul x4, x4, x4 __LF \ + adc x4, x4, xzr __LF \ + mul x10, x1, x3 __LF \ + adds x20, x20, x10 __LF \ + umulh x10, x1, x3 __LF \ + adcs x21, x21, x10 __LF \ + mul x10, x1, x5 __LF \ + adcs x22, x22, x10 __LF \ + umulh x10, x1, x5 __LF \ + adcs x23, x23, x10 __LF \ + mul x10, x1, x7 __LF \ + adcs x24, x24, x10 __LF \ + umulh x10, x1, x7 __LF \ + adcs x25, x25, x10 __LF \ + mul x10, x1, x9 __LF \ + adcs x0, x0, x10 __LF \ + umulh x10, x1, x9 __LF \ + adc x4, x4, x10 __LF \ + mul x2, x2, x2 __LF \ + cmp xzr, xzr __LF \ + extr x10, x20, x19, #9 __LF \ + adcs x2, x2, x10 __LF \ + extr x10, x21, x20, #9 __LF \ + adcs x11, x11, x10 __LF \ + extr x10, x22, x21, #9 __LF \ + adcs x12, x12, x10 __LF \ + extr x10, x23, x22, #9 __LF \ + adcs x13, x13, x10 __LF \ + extr x10, x24, x23, #9 __LF \ + adcs x14, x14, x10 __LF \ + extr x10, x25, x24, #9 __LF \ + adcs x15, x15, x10 __LF \ + extr x10, x0, x25, #9 __LF \ + adcs x16, x16, x10 __LF \ + extr x10, x4, x0, #9 __LF \ + adcs x17, x17, x10 __LF \ + orr x19, x19, #0xfffffffffffffe00 __LF \ + lsr x10, x4, #9 __LF \ + adcs x19, x19, x10 __LF \ + sbcs x2, x2, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbcs x15, x15, xzr __LF \ + sbcs x16, x16, xzr __LF \ + sbcs x17, x17, xzr __LF \ + sbc x19, x19, xzr __LF \ + and x19, x19, #0x1ff __LF \ + stp x2, x11, [P0] __LF \ + stp x12, x13, [P0+16] __LF \ + stp x14, x15, [P0+32] __LF \ + stp x16, x17, [P0+48] __LF \ + str x19, [P0+64] + +// Corresponds exactly to bignum_sub_p521 + +#define sub_p521(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + ldp x9, x10, [P1+32] __LF \ + ldp x4, x3, [P2+32] __LF \ + sbcs x9, x9, x4 __LF \ + sbcs x10, x10, x3 __LF \ + ldp x11, x12, [P1+48] __LF \ + ldp x4, x3, [P2+48] __LF \ + sbcs x11, x11, x4 __LF \ + sbcs x12, x12, x3 __LF \ + ldr x13, [P1+64] __LF \ + ldr x4, [P2+64] __LF \ + sbcs x13, x13, x4 __LF \ + sbcs x5, x5, xzr __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbcs x11, x11, xzr __LF \ + sbcs x12, x12, xzr __LF \ + sbcs x13, x13, xzr __LF \ + and x13, x13, #0x1ff __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] __LF \ + stp x9, x10, [P0+32] __LF \ + stp x11, x12, [P0+48] __LF \ + str x13, [P0+64] + +S2N_BN_SYMBOL(p521_jmixadd_alt): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p521(zp2,z_1) + mul_p521(y2a,z_1,y_2) + + mul_p521(x2a,zp2,x_2) + mul_p521(y2a,zp2,y2a) + + sub_p521(xd,x2a,x_1) + sub_p521(yd,y2a,y_1) + + sqr_p521(zz,xd) + sqr_p521(ww,yd) + + mul_p521(zzx1,zz,x_1) + mul_p521(zzx2,zz,x2a) + + sub_p521(resx,ww,zzx1) + sub_p521(t1,zzx2,zzx1) + + mul_p521(resz,xd,z_1) + + sub_p521(resx,resx,zzx2) + + sub_p521(t2,zzx1,resx) + + mul_p521(t1,t1,y_1) + mul_p521(t2,yd,t2) + + sub_p521(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + orr x0, x0, x1 + ldp x2, x3, [z_1+16] + orr x2, x2, x3 + ldp x4, x5, [z_1+32] + orr x4, x4, x5 + ldp x6, x7, [z_1+48] + orr x6, x6, x7 + ldr x8, [z_1+64] + orr x0, x0, x2 + orr x4, x4, x6 + orr x0, x0, x4 + orr x0, x0, x8 + cmp x0, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x20, x21, [x_2] + csel x0, x0, x20, ne + csel x1, x1, x21, ne + ldp x2, x3, [resx+16] + ldp x20, x21, [x_2+16] + csel x2, x2, x20, ne + csel x3, x3, x21, ne + ldp x4, x5, [resx+32] + ldp x20, x21, [x_2+32] + csel x4, x4, x20, ne + csel x5, x5, x21, ne + ldp x6, x7, [resx+48] + ldp x20, x21, [x_2+48] + csel x6, x6, x20, ne + csel x7, x7, x21, ne + ldr x8, [resx+64] + ldr x20, [x_2+64] + csel x8, x8, x20, ne + + ldp x10, x11, [resy] + ldp x20, x21, [y_2] + csel x10, x10, x20, ne + csel x11, x11, x21, ne + ldp x12, x13, [resy+16] + ldp x20, x21, [y_2+16] + csel x12, x12, x20, ne + csel x13, x13, x21, ne + ldp x14, x15, [resy+32] + ldp x20, x21, [y_2+32] + csel x14, x14, x20, ne + csel x15, x15, x21, ne + ldp x16, x17, [resy+48] + ldp x20, x21, [y_2+48] + csel x16, x16, x20, ne + csel x17, x17, x21, ne + ldr x19, [resy+64] + ldr x20, [y_2+64] + csel x19, x19, x20, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [x_3+32] + stp x6, x7, [x_3+48] + str x8, [x_3+64] + stp x10, x11, [y_3] + stp x12, x13, [y_3+16] + stp x14, x15, [y_3+32] + stp x16, x17, [y_3+48] + str x19, [y_3+64] + + ldp x0, x1, [resz] + mov x20, #1 + csel x0, x0, x20, ne + csel x1, x1, xzr, ne + ldp x2, x3, [resz+16] + csel x2, x2, xzr, ne + csel x3, x3, xzr, ne + ldp x4, x5, [resz+32] + csel x4, x4, xzr, ne + csel x5, x5, xzr, ne + ldp x6, x7, [resz+48] + csel x6, x6, xzr, ne + csel x7, x7, xzr, ne + ldr x8, [resz+64] + csel x8, x8, xzr, ne + + stp x0, x1, [z_3] + stp x2, x3, [z_3+16] + stp x4, x5, [z_3+32] + stp x6, x7, [z_3+48] + str x8, [z_3+64] + +// Restore stack and registers + + add sp, sp, NSPACE + + ldp x27, x28, [sp], 16 + ldp x25, x26, [sp], 16 + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/p521_jscalarmul.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul.S index 37cc9231302..3c26f7fe97f 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jscalarmul.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul.S @@ -59,29 +59,29 @@ #define NSPACE #(55*NUMSIZE+8) 
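// NOTE (reviewer annotation, not upstream source): selectblock(I) below performs one
// step of a constant-time table lookup. Each invocation loads a full 72-byte field
// element from the current table position and uses csel to keep it in x0..x8 only
// when the digit bf equals I, then advances tabup by JACSIZE; every candidate entry
// is read regardless of bf, so the memory access pattern does not depend on the
// scalar digit. The hunk itself only swaps the ';' separators for the __LF token
// used throughout the imported tree, presumably so that each instruction remains a
// separate assembler statement after the sources pass through the C preprocessor
// in the static build.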
#define selectblock(I) \ - cmp bf, #(1*I); \ - ldp x10, x11, [tabup]; \ - csel x0, x10, x0, eq; \ - csel x1, x11, x1, eq; \ - ldp x10, x11, [tabup, #16]; \ - csel x2, x10, x2, eq; \ - csel x3, x11, x3, eq; \ - ldp x10, x11, [tabup, #32]; \ - csel x4, x10, x4, eq; \ - csel x5, x11, x5, eq; \ - ldp x10, x11, [tabup, #48]; \ - csel x6, x10, x6, eq; \ - csel x7, x11, x7, eq; \ - ldr x10, [tabup, #64]; \ - csel x8, x10, x8, eq; \ + cmp bf, #(1*I) __LF \ + ldp x10, x11, [tabup] __LF \ + csel x0, x10, x0, eq __LF \ + csel x1, x11, x1, eq __LF \ + ldp x10, x11, [tabup, #16] __LF \ + csel x2, x10, x2, eq __LF \ + csel x3, x11, x3, eq __LF \ + ldp x10, x11, [tabup, #32] __LF \ + csel x4, x10, x4, eq __LF \ + csel x5, x11, x5, eq __LF \ + ldp x10, x11, [tabup, #48] __LF \ + csel x6, x10, x6, eq __LF \ + csel x7, x11, x7, eq __LF \ + ldr x10, [tabup, #64] __LF \ + csel x8, x10, x8, eq __LF \ add tabup, tabup, #JACSIZE // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(p521_jscalarmul): diff --git a/third_party/s2n-bignum/arm/p521/p521_jscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul_alt.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/p521_jscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul_alt.S index 89e0408d8b4..1f64a9b176f 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/p521_jscalarmul_alt.S @@ -59,29 +59,29 @@ #define NSPACE #(55*NUMSIZE+8) #define selectblock(I) \ - cmp bf, #(1*I); \ - ldp x10, x11, [tabup]; \ - csel x0, x10, x0, eq; \ - csel x1, x11, x1, eq; \ - ldp x10, x11, [tabup, #16]; \ - csel x2, x10, x2, eq; \ - csel x3, x11, x3, eq; \ - ldp x10, x11, [tabup, #32]; \ - csel x4, x10, x4, eq; \ - csel x5, x11, x5, eq; \ - ldp x10, x11, [tabup, #48]; \ - csel x6, x10, x6, eq; \ - csel x7, x11, x7, eq; \ - ldr x10, [tabup, #64]; \ - csel x8, x10, x8, eq; \ + cmp bf, #(1*I) __LF \ + ldp x10, x11, [tabup] __LF \ + csel x0, x10, x0, eq __LF \ + csel x1, x11, x1, eq __LF \ + ldp x10, x11, [tabup, #16] __LF \ + csel x2, x10, x2, eq __LF \ + csel x3, x11, x3, eq __LF \ + ldp x10, x11, [tabup, #32] __LF \ + csel x4, x10, x4, eq __LF \ + csel x5, x11, x5, eq __LF \ + ldp x10, x11, [tabup, #48] __LF \ + csel x6, x10, x6, eq __LF \ + csel x7, x11, x7, eq __LF \ + ldr x10, [tabup, #64] __LF \ + csel x8, x10, x8, eq __LF \ add tabup, tabup, #JACSIZE // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ - movz nn, n0; \ - movk nn, n1, lsl #16; \ - movk nn, n2, lsl #32; \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ movk nn, n3, lsl #48 S2N_BN_SYMBOL(p521_jscalarmul_alt): diff --git a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montmul_p521_base.S similarity index 78% rename from third_party/s2n-bignum/arm/p521/bignum_montmul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montmul_p521_base.S index e1ea8dc0c22..81ba5660a1c 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_montmul_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montmul_p521_base.S @@ -5,20 +5,20 @@ // Montgomery multiply, z := (x * y / 2^576) mod p_521 // Inputs x[9], y[9]; output z[9] // -// 
extern void bignum_montmul_p521 +// extern void bignum_montmul_p521_base // (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); // // Does z := (x * y / 2^576) mod p_521, assuming x < p_521, y < p_521. This // means the Montgomery base is the "native size" 2^{9*64} = 2^576; since -// p_521 is a Mersenne prime the basic modular multiplication bignum_mul_p521 +// p_521 is a Mersenne prime the basic modular multiplication bignum_mul_p521_base // can be considered a Montgomery operation to base 2^521. // // Standard ARM ABI: X0 = z, X1 = x, X2 = y // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p521_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p521_base) .text .balign 4 @@ -33,18 +33,18 @@ // --------------------------------------------------------------------------- #define muldiffnadd(b,a,x,y,w,z) \ - subs t, x, y; \ - cneg t, t, cc; \ - csetm c, cc; \ - subs h, w, z; \ - cneg h, h, cc; \ - mul l, t, h; \ - umulh h, t, h; \ - cinv c, c, cc; \ - adds xzr, c, #1; \ - eor l, l, c; \ - adcs a, a, l; \ - eor h, h, c; \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + adds xzr, c, #1 __LF \ + eor l, l, c __LF \ + adcs a, a, l __LF \ + eor h, h, c __LF \ adcs b, b, h #define z x0 @@ -85,69 +85,69 @@ #define mul4 \ /* First accumulate all the "simple" products as [s7,s6,s5,s4,s0] */ \ \ - mul s0, a0, b0; \ - mul s4, a1, b1; \ - mul s5, a2, b2; \ - mul s6, a3, b3; \ + mul s0, a0, b0 __LF \ + mul s4, a1, b1 __LF \ + mul s5, a2, b2 __LF \ + mul s6, a3, b3 __LF \ \ - umulh s7, a0, b0; \ - adds s4, s4, s7; \ - umulh s7, a1, b1; \ - adcs s5, s5, s7; \ - umulh s7, a2, b2; \ - adcs s6, s6, s7; \ - umulh s7, a3, b3; \ - adc s7, s7, xzr; \ + umulh s7, a0, b0 __LF \ + adds s4, s4, s7 __LF \ + umulh s7, a1, b1 __LF \ + adcs s5, s5, s7 __LF \ + umulh s7, a2, b2 __LF \ + adcs s6, s6, s7 __LF \ + umulh s7, a3, b3 __LF \ + adc s7, s7, xzr __LF \ \ /* Multiply by B + 1 to get [s7;s6;s5;s4;s1;s0] */ \ \ - adds s1, s4, s0; \ - adcs s4, s5, s4; \ - adcs s5, s6, s5; \ - adcs s6, s7, s6; \ - adc s7, xzr, s7; \ + adds s1, s4, s0 __LF \ + adcs s4, s5, s4 __LF \ + adcs s5, s6, s5 __LF \ + adcs s6, s7, s6 __LF \ + adc s7, xzr, s7 __LF \ \ /* Multiply by B^2 + 1 to get [s7;s6;s5;s4;s3;s2;s1;s0] */ \ \ - adds s2, s4, s0; \ - adcs s3, s5, s1; \ - adcs s4, s6, s4; \ - adcs s5, s7, s5; \ - adcs s6, xzr, s6; \ - adc s7, xzr, s7; \ + adds s2, s4, s0 __LF \ + adcs s3, s5, s1 __LF \ + adcs s4, s6, s4 __LF \ + adcs s5, s7, s5 __LF \ + adcs s6, xzr, s6 __LF \ + adc s7, xzr, s7 __LF \ \ /* Now add in all the "complicated" terms. 
*/ \ \ - muldiffnadd(s6,s5, a2,a3, b3,b2); \ - adc s7, s7, c; \ + muldiffnadd(s6,s5, a2,a3, b3,b2) __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s2,s1, a0,a1, b1,b0); \ - adcs s3, s3, c; \ - adcs s4, s4, c; \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s2,s1, a0,a1, b1,b0) __LF \ + adcs s3, s3, c __LF \ + adcs s4, s4, c __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s5,s4, a1,a3, b3,b1); \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s5,s4, a1,a3, b3,b1) __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s3,s2, a0,a2, b2,b0); \ - adcs s4, s4, c; \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s3,s2, a0,a2, b2,b0) __LF \ + adcs s4, s4, c __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s4,s3, a0,a3, b3,b0); \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ - muldiffnadd(s4,s3, a1,a2, b2,b1); \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ + muldiffnadd(s4,s3, a0,a3, b3,b0) __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ + muldiffnadd(s4,s3, a1,a2, b2,b1) __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ adc s7, s7, c \ -S2N_BN_SYMBOL(bignum_montmul_p521): +S2N_BN_SYMBOL(bignum_montmul_p521_base): // Save registers and make space for the temporary buffer diff --git a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montsqr_p521_base.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/bignum_montsqr_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montsqr_p521_base.S index 2c8dbd789f8..9ee30e07f3c 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_montsqr_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_montsqr_p521_base.S @@ -5,20 +5,20 @@ // Montgomery square, z := (x^2 / 2^576) mod p_521 // Input x[9]; output z[9] // -// extern void bignum_montsqr_p521 +// extern void bignum_montsqr_p521_base // (uint64_t z[static 9], uint64_t x[static 9]); // // Does z := (x^2 / 2^576) mod p_521, assuming x < p_521. This means the // Montgomery base is the "native size" 2^{9*64} = 2^576; since p_521 is -// a Mersenne prime the basic modular squaring bignum_sqr_p521 can be +// a Mersenne prime the basic modular squaring bignum_sqr_p521_base can be // considered a Montgomery operation to base 2^521. 
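// (In other words: p_521 = 2^521 - 1, so 2^521 == 1 (mod p_521) and hence
// 2^576 == 2^55 (mod p_521). The Montgomery square to base 2^576 computed here
// therefore returns x^2 * 2^-55 mod p_521, while a Montgomery operation to
// base 2^521 has R == 1 and so coincides with a plain modular product, which
// is why bignum_sqr_p521_base qualifies as one.)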
// // Standard ARM ABI: X0 = z, X1 = x // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p521_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p521_base) .text .balign 4 @@ -62,7 +62,7 @@ #define d7 x9 #define d8 x10 -S2N_BN_SYMBOL(bignum_montsqr_p521): +S2N_BN_SYMBOL(bignum_montsqr_p521_base): // Save registers diff --git a/third_party/s2n-bignum/arm/p521/bignum_mul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_mul_p521_base.S similarity index 78% rename from third_party/s2n-bignum/arm/p521/bignum_mul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_mul_p521_base.S index 97859d6bbec..2c583c17ec3 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_mul_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_mul_p521_base.S @@ -5,15 +5,15 @@ // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced // Inputs x[9], y[9]; output z[9] // -// extern void bignum_mul_p521 +// extern void bignum_mul_p521_base // (uint64_t z[static 9], uint64_t x[static 9], uint64_t y[static 9]); // // Standard ARM ABI: X0 = z, X1 = x, X2 = y // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p521_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p521_base) .text .balign 4 @@ -28,18 +28,18 @@ // --------------------------------------------------------------------------- #define muldiffnadd(b,a,x,y,w,z) \ - subs t, x, y; \ - cneg t, t, cc; \ - csetm c, cc; \ - subs h, w, z; \ - cneg h, h, cc; \ - mul l, t, h; \ - umulh h, t, h; \ - cinv c, c, cc; \ - adds xzr, c, #1; \ - eor l, l, c; \ - adcs a, a, l; \ - eor h, h, c; \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + adds xzr, c, #1 __LF \ + eor l, l, c __LF \ + adcs a, a, l __LF \ + eor h, h, c __LF \ adcs b, b, h #define z x0 @@ -80,69 +80,69 @@ #define mul4 \ /* First accumulate all the "simple" products as [s7,s6,s5,s4,s0] */ \ \ - mul s0, a0, b0; \ - mul s4, a1, b1; \ - mul s5, a2, b2; \ - mul s6, a3, b3; \ + mul s0, a0, b0 __LF \ + mul s4, a1, b1 __LF \ + mul s5, a2, b2 __LF \ + mul s6, a3, b3 __LF \ \ - umulh s7, a0, b0; \ - adds s4, s4, s7; \ - umulh s7, a1, b1; \ - adcs s5, s5, s7; \ - umulh s7, a2, b2; \ - adcs s6, s6, s7; \ - umulh s7, a3, b3; \ - adc s7, s7, xzr; \ + umulh s7, a0, b0 __LF \ + adds s4, s4, s7 __LF \ + umulh s7, a1, b1 __LF \ + adcs s5, s5, s7 __LF \ + umulh s7, a2, b2 __LF \ + adcs s6, s6, s7 __LF \ + umulh s7, a3, b3 __LF \ + adc s7, s7, xzr __LF \ \ /* Multiply by B + 1 to get [s7;s6;s5;s4;s1;s0] */ \ \ - adds s1, s4, s0; \ - adcs s4, s5, s4; \ - adcs s5, s6, s5; \ - adcs s6, s7, s6; \ - adc s7, xzr, s7; \ + adds s1, s4, s0 __LF \ + adcs s4, s5, s4 __LF \ + adcs s5, s6, s5 __LF \ + adcs s6, s7, s6 __LF \ + adc s7, xzr, s7 __LF \ \ /* Multiply by B^2 + 1 to get [s7;s6;s5;s4;s3;s2;s1;s0] */ \ \ - adds s2, s4, s0; \ - adcs s3, s5, s1; \ - adcs s4, s6, s4; \ - adcs s5, s7, s5; \ - adcs s6, xzr, s6; \ - adc s7, xzr, s7; \ + adds s2, s4, s0 __LF \ + adcs s3, s5, s1 __LF \ + adcs s4, s6, s4 __LF \ 
+ adcs s5, s7, s5 __LF \ + adcs s6, xzr, s6 __LF \ + adc s7, xzr, s7 __LF \ \ /* Now add in all the "complicated" terms. */ \ \ - muldiffnadd(s6,s5, a2,a3, b3,b2); \ - adc s7, s7, c; \ + muldiffnadd(s6,s5, a2,a3, b3,b2) __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s2,s1, a0,a1, b1,b0); \ - adcs s3, s3, c; \ - adcs s4, s4, c; \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s2,s1, a0,a1, b1,b0) __LF \ + adcs s3, s3, c __LF \ + adcs s4, s4, c __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s5,s4, a1,a3, b3,b1); \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s5,s4, a1,a3, b3,b1) __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s3,s2, a0,a2, b2,b0); \ - adcs s4, s4, c; \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ + muldiffnadd(s3,s2, a0,a2, b2,b0) __LF \ + adcs s4, s4, c __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ \ - muldiffnadd(s4,s3, a0,a3, b3,b0); \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ - adc s7, s7, c; \ - muldiffnadd(s4,s3, a1,a2, b2,b1); \ - adcs s5, s5, c; \ - adcs s6, s6, c; \ + muldiffnadd(s4,s3, a0,a3, b3,b0) __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ + adc s7, s7, c __LF \ + muldiffnadd(s4,s3, a1,a2, b2,b1) __LF \ + adcs s5, s5, c __LF \ + adcs s6, s6, c __LF \ adc s7, s7, c \ -S2N_BN_SYMBOL(bignum_mul_p521): +S2N_BN_SYMBOL(bignum_mul_p521_base): // Save registers and make space for the temporary buffer diff --git a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_sqr_p521_base.S similarity index 98% rename from third_party/s2n-bignum/arm/p521/bignum_sqr_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_sqr_p521_base.S index 404665258c4..937e7234ed4 100644 --- a/third_party/s2n-bignum/arm/p521/bignum_sqr_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/p521/unopt/bignum_sqr_p521_base.S @@ -5,14 +5,14 @@ // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced // Input x[9]; output z[9] // -// extern void bignum_sqr_p521 (uint64_t z[static 9], uint64_t x[static 9]); +// extern void bignum_sqr_p521_base (uint64_t z[static 9], uint64_t x[static 9]); // // Standard ARM ABI: X0 = z, X1 = x // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" - S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521) - S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p521_base) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p521_base) .text .balign 4 @@ -56,7 +56,7 @@ #define d7 x9 #define d8 x10 -S2N_BN_SYMBOL(bignum_sqr_p521): +S2N_BN_SYMBOL(bignum_sqr_p521_base): // Save registers diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/Makefile new file mode 100644 index 00000000000..5ba07a7d7fc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/Makefile @@ -0,0 +1,56 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). 
Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_add_p256k1.o \ + bignum_cmul_p256k1.o \ + bignum_deamont_p256k1.o \ + bignum_demont_p256k1.o \ + bignum_double_p256k1.o \ + bignum_half_p256k1.o \ + bignum_mod_n256k1_4.o \ + bignum_mod_p256k1_4.o \ + bignum_montmul_p256k1.o \ + bignum_montmul_p256k1_alt.o \ + bignum_montsqr_p256k1.o \ + bignum_montsqr_p256k1_alt.o \ + bignum_mul_p256k1.o \ + bignum_mul_p256k1_alt.o \ + bignum_neg_p256k1.o \ + bignum_optneg_p256k1.o \ + bignum_sqr_p256k1.o \ + bignum_sqr_p256k1_alt.o \ + bignum_sub_p256k1.o \ + bignum_tomont_p256k1.o \ + bignum_triple_p256k1.o \ + secp256k1_jadd.o \ + secp256k1_jadd_alt.o \ + secp256k1_jdouble.o \ + secp256k1_jdouble_alt.o \ + secp256k1_jmixadd.o \ + secp256k1_jmixadd_alt.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_add_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_add_p256k1.S new file mode 100644 index 00000000000..7ca98e3e86e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_add_p256k1.S @@ -0,0 +1,79 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define l x4 +#define d0 x5 +#define d1 x6 +#define d2 x7 +#define d3 x8 +#define dd x9 + +S2N_BN_SYMBOL(bignum_add_p256k1): + +// First just add the numbers as z = x + y = 2^256 * c + [d3; d2; d1; d0] +// Also create dd = d3 AND d2 AND d1 to condense the later comparison + + ldp d0, d1, [x] + ldp l, c, [y] + adds d0, d0, l + adcs d1, d1, c + ldp d2, d3, [x, #16] + ldp l, c, [y, #16] + adcs d2, d2, l + and dd, d1, d2 + adcs d3, d3, c + and dd, dd, d3 + adc c, xzr, xzr + +// Let l = 4294968273 so that p_256k1 = 2^256 - l + + mov l, #977 + orr l, l, #0x100000000 + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256 + + adds xzr, d0, l + adcs xzr, dd, xzr + adcs c, c, xzr + +// Now c <> 0 <=> z >= p_256k1, so mask the constant l accordingly + + csel l, l, xzr, ne + +// If z >= p_256k1 do z := z - p_256k1, i.e. 
add l in 4 digits + + adds d0, d0, l + adcs d1, d1, xzr + adcs d2, d2, xzr + adc d3, d3, xzr + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_cmul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_cmul_p256k1.S new file mode 100644 index 00000000000..b287742bd2b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_cmul_p256k1.S @@ -0,0 +1,95 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256k1 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = c, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256k1) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define m x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define a0 x7 +#define a1 x8 + +#define a2 x9 +#define c x9 + +#define a3 x10 +#define h x10 +#define q x10 + + +S2N_BN_SYMBOL(bignum_cmul_p256k1): + +S2N_BN_SYMBOL(bignum_cmul_p256k1_alt): + +// First do the multiply, straightforwardly to get [h;d3;d2;d1;d0] + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + mul d0, m, a0 + mul d1, m, a1 + mul d2, m, a2 + mul d3, m, a3 + umulh a0, m, a0 + umulh a1, m, a1 + umulh a2, m, a2 + umulh h, m, a3 + adds d1, d1, a0 + adcs d2, d2, a1 + adcs d3, d3, a2 + adcs h, h, xzr + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + add q, h, #1 + mov c, #977 + orr c, c, #0x100000000 + mul a0, q, c + umulh a1, q, c + adds d0, d0, a0 + adcs d1, d1, a1 + adcs d2, d2, xzr + adcs d3, d3, xzr + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting c. + + csel c, c, xzr, cc + subs d0, d0, c + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_deamont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_deamont_p256k1.S new file mode 100644 index 00000000000..245b433844a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_deamont_p256k1.S @@ -0,0 +1,110 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256k1, +// Input x[4]; output z[4] +// +// extern void bignum_deamont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256k1) + .text + .balign 4 + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define t x6 +#define w x7 +#define c x8 +#define dd x9 + +S2N_BN_SYMBOL(bignum_deamont_p256k1): + +// Load input and set up constants c = 4294968273 so p_256k1 = 2^256 - c, +// and w the negated multiplicative inverse p_256k1 * w == -1 (mod 2^64). + + ldp d0, d1, [x] + movz w, #0x3531 + movk w, #0xd225, lsl #16 + ldp d2, d3, [x, #16] + movk w, #0x091d, lsl #32 + movk w, #0xd838, lsl #48 + mov c, #977 + orr c, c, #0x100000000 + +// Four stages of Montgomery reduction, rotating the register window +// Let dd be the AND of all 4 words of the cofactor q as it is computed + + mul d0, w, d0 + umulh t, d0, c + subs d1, d1, t + + mul d1, w, d1 + umulh t, d1, c + and dd, d0, d1 + sbcs d2, d2, t + + mul d2, w, d2 + umulh t, d2, c + and dd, dd, d2 + sbcs d3, d3, t + + mul d3, w, d3 + umulh t, d3, c + and dd, dd, d3 + sbcs d0, d0, t + + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + +// The result thus far is z = (x + q * p_256k1) / 2^256. Note that +// z < p_256k1 <=> x < (2^256 - q) * p_256k1, and since +// x < 2^256 < 2 * p_256k1, we have that *if* q < 2^256 - 1 then +// z < p_256k1. Conversely if q = 2^256 - 1 then since +// x + q * p_256k1 == 0 (mod 2^256) we have x == p_256k1 (mod 2^256) +// and thus x = p_256k1, and z >= p_256k1 (in fact z = p_256k1). +// So in summary z < p_256k1 <=> ~(q = 2^256 - 1) <=> ~(x = p_256k1). +// and hence iff q is all 1s, or equivalently dd is all 1s, we +// correct by subtracting p_256k1 to get 0. Since this is only one +// case we compute the result more explicitly rather than doing +// arithmetic with carry propagation. + + add c, c, d0 + cmp dd, #-1 + csel d0, c, d0, eq + csel d1, xzr, d1, eq + csel d2, xzr, d2, eq + csel d3, xzr, d3, eq + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_demont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_demont_p256k1.S new file mode 100644 index 00000000000..bbea9c18c50 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_demont_p256k1.S @@ -0,0 +1,87 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256k1, +// assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_256k1 for correctness. If this is not the +// case, use the variant "bignum_deamont_p256k1" instead. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256k1) + .text + .balign 4 + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define t x6 +#define w x7 +#define c x8 + +S2N_BN_SYMBOL(bignum_demont_p256k1): + +// Load input and set up constants c = 4294968273 so p_256k1 = 2^256 - c, +// and w the negated multiplicative inverse p_256k1 * w == -1 (mod 2^64). + + ldp d0, d1, [x] + movz w, #0x3531 + movk w, #0xd225, lsl #16 + ldp d2, d3, [x, #16] + movk w, #0x091d, lsl #32 + movk w, #0xd838, lsl #48 + mov c, #977 + orr c, c, #0x100000000 + +// Four stages of Montgomery reduction, rotating the register window + + mul d0, w, d0 + umulh t, d0, c + subs d1, d1, t + + mul d1, w, d1 + umulh t, d1, c + sbcs d2, d2, t + + mul d2, w, d2 + umulh t, d2, c + sbcs d3, d3, t + + mul d3, w, d3 + umulh t, d3, c + sbcs d0, d0, t + + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_double_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_double_p256k1.S new file mode 100644 index 00000000000..b54c46323c6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_double_p256k1.S @@ -0,0 +1,76 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define c x6 +#define dd x7 +#define l x8 + +S2N_BN_SYMBOL(bignum_double_p256k1): + +// Load the inputs and double top-down as z = 2^256 * c + [d3;d2;d1;d0] +// While doing this, create an AND dd of [d3;d2;d1] to condense comparison + + ldp d2, d3, [x, #16] + lsr c, d3, #63 + extr d3, d3, d2, #63 + ldp d0, d1, [x] + extr d2, d2, d1, #63 + and dd, d2, d3 + extr d1, d1, d0, #63 + and dd, dd, d1 + lsl d0, d0, #1 + +// Let l = 4294968273 so that p_256k1 = 2^256 - l + + mov l, #977 + orr l, l, #0x100000000 + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256 + + adds xzr, d0, l + adcs xzr, dd, xzr + adcs c, c, xzr + +// Now c <> 0 <=> z >= p_256k1, so mask the constant l accordingly + + csel l, l, xzr, ne + +// If z >= p_256k1 do z := z - p_256k1, i.e. add l in 4 digits + + adds d0, d0, l + adcs d1, d1, xzr + adcs d2, d2, xzr + adc d3, d3, xzr + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_half_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_half_p256k1.S new file mode 100644 index 00000000000..70d9ced6f29 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_half_p256k1.S @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define c x6 + +S2N_BN_SYMBOL(bignum_half_p256k1): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Let b be the LSB of the input (i.e. whether it is odd). +// Create c = 4294968273 * b + + mov c, #977 + orr c, c, #0x100000000 + tst d0, #1 + csel c, c, xzr, ne + +// We want (x + b * p_256k1) / 2 where b is that LSB, in {0,1}. +// That amounts to (2^256 * b + x - 4294968273 * b) / 2, and +// modulo 4 words that's the same as ([2^256 * c + x] - c) / 2. +// So do that subtraction and shift a place right as we go. 
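The identity behind this halving step can be checked with a few lines of Python on the host (illustrative sketch only, not part of the patch; the name half_p256k1 is made up here):

import random

p_256k1 = 2**256 - 4294968273

def half_p256k1(x):
    # (x + b*p_256k1)/2 with b the LSB of x; the sum is always even,
    # so the shift is exact, mirroring the subtract-then-shift in the code.
    return (x + (x & 1) * p_256k1) >> 1

for _ in range(1000):
    x = random.randrange(p_256k1)
    assert (2 * half_p256k1(x)) % p_256k1 == x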
+ + subs d0, d0, c + sbcs d1, d1, xzr + extr d0, d1, d0, #1 + sbcs d2, d2, xzr + extr d1, d2, d1, #1 + sbcs d3, d3, xzr + extr d2, d3, d2, #1 + sbc c, c, xzr + extr d3, c, d3, #1 + +// Store back and return + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_n256k1_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_n256k1_4.S new file mode 100644 index 00000000000..16109d8bb37 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_n256k1_4.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n256k1_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the secp256k1 curve. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256k1_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256k1_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_n256k1_4): + +// Load the complicated three words of n_256k1, the other being all 1s + + movbig( n0, #0xbfd2, #0x5e8c, #0xd036, #0x4141) + movbig( n1, #0xbaae, #0xdce6, #0xaf48, #0xa03b) + mov n2, 0xFFFFFFFFFFFFFFFE + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. Since word 3 of n_256k1 is all 1s, that can be +// done by adding zero with carry, thanks to the inverted carry. + + subs n0, d0, n0 + sbcs n1, d1, n1 + sbcs n2, d2, n2 + adcs n3, d3, xzr + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_p256k1_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_p256k1_4.S new file mode 100644 index 00000000000..6fb3ad133a8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mod_p256k1_4.S @@ -0,0 +1,65 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p256k1_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256k1_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256k1_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define d x6 +#define c x7 + +S2N_BN_SYMBOL(bignum_mod_p256k1_4): + +// Load the inputs as [d3;d2;d1;d0] and let d be an AND of [d3;d2;d1] to +// condense the comparison below. + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + and d, d1, d2 + and d, d, d3 + +// Compare x >= p_256k1 = 2^256 - 4294968273 using condensed carry: +// we get a carry from the lowest digit and all other digits are 1. +// We end up with c and d as adjusted digits for x - p_256k1 if so. + + mov c, #977 + orr c, c, #0x100000000 + adds c, c, d0 + adcs d, d, xzr + +// If indeed x >= p_256k1 then x := x - p_256k1, using c and d + + csel d0, d0, c, cc + csel d1, d1, d, cc + csel d2, d2, d, cc + csel d3, d3, d, cc + +// Store the end result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1.S new file mode 100644 index 00000000000..50f0691d3ee --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1.S @@ -0,0 +1,278 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256k1, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256k1 (in particular this is true if we are in +// the "usual" case x < p_256k1 and y < p_256k1). 
+// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256k1) + .text + .balign 4 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +#define muldiffn(c,h,l, t, x,y, w,z) \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ + eor h, h, c + +// --------------------------------------------------------------------------- +// Core two-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d5;d4;d3;d2], modifying the +// existing contents of [d3;d2;d1] and generating d4 and d5, while +// using t1, t2, and t3 as temporaries. It's OK if d4 == d0 and d5 == d1. +// --------------------------------------------------------------------------- + +#define montreds2(d5,d4,d3,d2,d1,d0) \ + movbig(t2, 0xd838, #0x091d, #0xd225, #0x3531) __LF \ + mul d4, t2, d0 __LF \ + mov t3, #977 __LF \ + orr t3, t3, #0x100000000 __LF \ + umulh t1, d4, t3 __LF \ + subs d1, d1, t1 __LF \ + mul d5, t2, d1 __LF \ + umulh t1, d5, t3 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, xzr __LF \ + sbcs d4, d4, xzr __LF \ + sbc d5, d5, xzr + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define s0 x11 +#define s1 x12 +#define s2 x13 +#define s3 x14 +#define t0 x15 +#define t1 x16 +#define t2 x17 +#define t3 x1 +#define s4 x2 + +S2N_BN_SYMBOL(bignum_montmul_p256k1): + +// Load in all words of both inputs + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + ldp b0, b1, [x2] + ldp b2, b3, [x2, #16] + +// Multiply low halves with a 2x2->4 ADK multiplier as L = [s3;s2;s1;s0] + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a0,a1, b1,b0) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Perform two "short" Montgomery steps on the low product to +// get a modified low result L' = [s1;s0;s3;s2] +// This shifts it to an offset compatible with middle terms +// Stash the result L' temporarily in the output buffer to avoid +// using additional registers. 
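For reference, a small Python model of the generic word-level Montgomery step that montreds2 fuses twice may help; the macro realizes the same congruence in subtractive form, exploiting p_256k1 = 2^256 - 4294968273, but the net effect per step is as below (sketch only, mont_word_step is an illustrative name):

p = 2**256 - 4294968273
w = (-pow(p, -1, 2**64)) % 2**64     # p * w == -1 (mod 2^64), as stated above

def mont_word_step(t):
    m = ((t % 2**64) * w) % 2**64    # multiplier derived from the low word
    t = t + m * p
    assert t % 2**64 == 0            # the low word cancels exactly
    return t >> 64                   # congruent to the input times 2^-64 mod p

t0 = (123456789 << 192) + 977
assert (mont_word_step(t0) * 2**64) % p == t0 % p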
+ + montreds2(s1,s0,s3,s2,s1,s0) + + stp s2, s3, [x0] + stp s0, s1, [x0, #16] + +// Multiply high halves with a 2x2->4 ADK multiplier as H = [s3;s2;s1;s0] + + mul s0, a2, b2 + mul s2, a3, b3 + umulh s1, a2, b2 + adds t1, s0, s2 + umulh s3, a3, b3 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a2,a3, b3,b2) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Compute sign-magnitude a2,[a1,a0] = x_hi - x_lo + + subs a0, a2, a0 + sbcs a1, a3, a1 + sbc a2, xzr, xzr + adds xzr, a2, #1 + eor a0, a0, a2 + adcs a0, a0, xzr + eor a1, a1, a2 + adcs a1, a1, xzr + +// Compute sign-magnitude b2,[b1,b0] = y_lo - y_hi + + subs b0, b0, b2 + sbcs b1, b1, b3 + sbc b2, xzr, xzr + adds xzr, b2, #1 + eor b0, b0, b2 + adcs b0, b0, xzr + eor b1, b1, b2 + adcs b1, b1, xzr + +// Save the correct sign for the sub-product in b3 + + eor b3, a2, b2 + +// Add the high H to the modified low term L' as H + L' = [s4;b2;a2;t3;t0] + + ldp t0, t3, [x0] + adds t0, s0, t0 + adcs t3, s1, t3 + ldp a2, b2, [x0, #16] + adcs a2, s2, a2 + adcs b2, s3, b2 + adc s4, xzr, xzr + +// Multiply with yet a third 2x2->4 ADK multiplier for complex mid-term M + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(a1,t2,t1, a0, a0,a1, b1,b0) + adds xzr, a1, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, a1 + +// Set up a sign-modified version of the mid-product in a long accumulator +// as [b3;a1;a0;s3;s2;s1;s0], adding in the H + L' term once with +// zero offset as this signed value is created + + adds xzr, b3, #1 + eor s0, s0, b3 + adcs s0, s0, t0 + eor s1, s1, b3 + adcs s1, s1, t3 + eor s2, s2, b3 + adcs s2, s2, a2 + eor s3, s3, b3 + adcs s3, s3, b2 + adcs a0, s4, b3 + adcs a1, b3, xzr + adc b3, b3, xzr + +// Add in the stashed H + L' term an offset of 2 words as well + + adds s2, s2, t0 + adcs s3, s3, t3 + adcs a0, a0, a2 + adcs a1, a1, b2 + adc b3, b3, s4 + +// Do two more Montgomery steps on the composed term +// Net pre-reduct is in [b3;a1;a0;s3;s2] + + montreds2(s1,s0,s3,s2,s1,s0) + +// Finish addition and form condensed upper digits as "dd" + +#define dd b1 + + adds a0, a0, s0 + and dd, s3, a0 + adcs a1, a1, s1 + and dd, dd, a1 + adc b3, b3, xzr + +// Because of the way we added L' in two places, we can overspill by +// more than usual in Montgomery, with the result being only known to +// be < 3 * p_256k1, not the usual < 2 * p_256k1. So now we do a more +// elaborate final correction, making use of the condensed carry dd +// to see if the initial estimate q = 4294968273 * (h + 1) results +// in a negative true result, and if so use q = 4294968273 * h. 
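This unusual correction can be sanity-checked with a short Python model; here the quotient h + 1 is used directly, whereas the register q above holds 4294968273 * (h + 1), which is how the subtraction of (h + 1) * p_256k1 is carried out in 4 digits (sketch only, final_correct is an illustrative name):

import random

p = 2**256 - 4294968273

def final_correct(v):
    # v = 2^256*h + low, only known to satisfy v < 3*p as noted above.
    h = v >> 256
    r = v - (h + 1) * p
    if r < 0:                        # the "negative true result" case
        r += p                       # equivalent to using quotient h instead
    return r

for _ in range(1000):
    v = random.randrange(3 * p)
    assert final_correct(v) == v % p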
+ +#define d0 s2 +#define d1 s3 +#define d2 a0 +#define d3 a1 +#define h b3 + +#define q s4 +#define c b0 + + madd q, h, t3, t3 + + adds xzr, d0, q + sub h, q, t3 + adcs xzr, dd, xzr + + csel q, q, h, cs + + adds d0, d0, q + adcs d1, d1, xzr + adcs d2, d2, xzr + adc d3, d3, xzr + +// Finally store the result + + stp d0, d1, [x0] + stp d2, d3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1_alt.S new file mode 100644 index 00000000000..0383075c54b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montmul_p256k1_alt.S @@ -0,0 +1,233 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256k1, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256k1 (in particular this is true if we are in +// the "usual" case x < p_256k1 and y < p_256k1). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define l x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +#define c x17 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 + +#define w x1 +#define t x2 +#define uu b3 + +S2N_BN_SYMBOL(bignum_montmul_p256k1_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul l, a0, b1 + umulh u2, a0, b1 + adds u1, u1, l + + ldp b2, b3, [y, #16] + + mul l, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, l + + mul l, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, l + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Start the Montgomery reductions now to interleave better, though +// conceptually they all happen after the multiplication, only modifying +// any u_i when the multiplication process no longer uses it. Set up +// constants c = 4294968273 so that p_256k1 = 2^256 - c, and w the negated +// multiplicative inverse so that p_256k1 * w == -1 (mod 2^64). + + movz w, #0x3531 + movk w, #0xd225, lsl #16 + movk w, #0x091d, lsl #32 + movk w, #0xd838, lsl #48 + mov c, #977 + orr c, c, #0x100000000 + +// Precompute this part ahead of the main Montgomery stage. This +// is a repeated pattern below, since it seems to slightly improve +// dependent latencies. 
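Functionally, this routine and bignum_montmul_p256k1 compute the same map; a minimal Python reference for the stated contract (montmul_ref is an illustrative name, not an exported symbol):

import random

p = 2**256 - 4294968273
R = 2**256

def montmul_ref(x, y):
    # z := x * y / 2^256 mod p_256k1, the contract stated in the header
    return (x * y * pow(R, -1, p)) % p

for _ in range(1000):
    x, y = random.randrange(p), random.randrange(p)
    assert (montmul_ref(x, y) * R - x * y) % p == 0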
+ + mul u0, w, u0 + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul l, a1, b0 + adds u1, u1, l + mul l, a1, b1 + adcs u2, u2, l + mul l, a1, b2 + adcs u3, u3, l + mul l, a1, b3 + adcs u4, u4, l + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh l, a1, b0 + adds u2, u2, l + umulh l, a1, b1 + adcs u3, u3, l + umulh l, a1, b2 + adcs u4, u4, l + adc u5, u5, xzr + +// Montgomery stage 0; use t to record the suspended carry + + umulh l, u0, c + subs u1, u1, l + cset t, cc + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul l, a2, b0 + adds u2, u2, l + mul l, a2, b1 + adcs u3, u3, l + mul l, a2, b2 + adcs u4, u4, l + mul l, a2, b3 + adcs u5, u5, l + umulh u6, a2, b3 + adc u6, u6, xzr + + mul u1, w, u1 + + umulh l, a2, b0 + adds u3, u3, l + umulh l, a2, b1 + adcs u4, u4, l + umulh l, a2, b2 + adcs u5, u5, l + adc u6, u6, xzr + +// Montgomery stage 1 + + umulh l, u1, c + add l, l, t + subs u2, u2, l + cset t, cc + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] + + mul l, a3, b0 + adds u3, u3, l + mul l, a3, b1 + adcs u4, u4, l + mul l, a3, b2 + adcs u5, u5, l + mul l, a3, b3 + adcs u6, u6, l + umulh u7, a3, b3 + adc u7, u7, xzr + + mul u2, w, u2 + + umulh l, a3, b0 + adds u4, u4, l + umulh l, a3, b1 + adcs u5, u5, l + umulh l, a3, b2 + adcs u6, u6, l + adc u7, u7, xzr + +// Montgomery stages 2 and 3 (no longer using t to link the carries). + + umulh l, u2, c + add l, l, t + subs u3, u3, l + mul u3, w, u3 + umulh l, u3, c + sbcs u0, u0, l + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Now if a * b = 2^256 * h + l is the full product, we now have +// [u7;u6;u5;u4] = h and 2^256 * [u3;u2;u1;u0] == l (mod p_256k1) because +// of the Montgomery reductions on the low half. Now add the high part +// and the Montgomery-reduced low part. + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + and uu, u1, u2 + adcs u3, u3, u7 + and uu, uu, u3 + cset t, cs + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256 + + adds xzr, u0, c + adcs xzr, uu, xzr + adcs t, t, xzr + +// Now t <> 0 <=> z >= p_256k1, so mask the constant c accordingly + + csel c, c, xzr, ne + +// If z >= p_256k1 do z := z - p_256k1, i.e. add c in 4 digits + + adds u0, u0, c + adcs u1, u1, xzr + adcs u2, u2, xzr + adc u3, u3, xzr + +// Write back + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1.S new file mode 100644 index 00000000000..1fe1f12a680 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1.S @@ -0,0 +1,183 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256k1, assuming x^2 <= 2^256 * p_256k1, which +// is guaranteed in particular if x < p_256k1 initially (the "intended" case). 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 + +// Variables + +#define u0 x2 +#define u1 x3 +#define u2 x4 +#define u3 x5 +#define u4 x6 +#define u5 x7 +#define u6 x8 +#define u7 x9 + +#define w x10 +#define c x11 +#define t x12 +#define uu x13 + +S2N_BN_SYMBOL(bignum_montsqr_p256k1): + +// First just a near-clone of bignum_sqr_4_8 to get the square, using +// different registers to collect full product without writeback. + + ldp u4, u5, [x] + ldp x10, x11, [x, #16] + mul u2, u4, x10 + mul u7, u5, x11 + umulh x12, u4, x10 + subs x13, u4, u5 + cneg x13, x13, cc + csetm u1, cc + subs u0, x11, x10 + cneg u0, u0, cc + mul u6, x13, u0 + umulh u0, x13, u0 + cinv u1, u1, cc + eor u6, u6, u1 + eor u0, u0, u1 + adds u3, u2, x12 + adc x12, x12, xzr + umulh x13, u5, x11 + adds u3, u3, u7 + adcs x12, x12, x13 + adc x13, x13, xzr + adds x12, x12, u7 + adc x13, x13, xzr + cmn u1, #0x1 + adcs u3, u3, u6 + adcs x12, x12, u0 + adc x13, x13, u1 + adds u2, u2, u2 + adcs u3, u3, u3 + adcs x12, x12, x12 + adcs x13, x13, x13 + adc x14, xzr, xzr + mul u0, u4, u4 + mul u6, u5, u5 + mul x15, u4, u5 + umulh u1, u4, u4 + umulh u7, u5, u5 + umulh x16, u4, u5 + adds u1, u1, x15 + adcs u6, u6, x16 + adc u7, u7, xzr + adds u1, u1, x15 + adcs u6, u6, x16 + adc u7, u7, xzr + adds u2, u2, u6 + adcs u3, u3, u7 + adcs x12, x12, xzr + adcs x13, x13, xzr + adc x14, x14, xzr + mul u4, x10, x10 + mul u6, x11, x11 + mul x15, x10, x11 + umulh u5, x10, x10 + umulh u7, x11, x11 + umulh x16, x10, x11 + adds u5, u5, x15 + adcs u6, u6, x16 + adc u7, u7, xzr + adds u5, u5, x15 + adcs u6, u6, x16 + adc u7, u7, xzr + adds u4, u4, x12 + adcs u5, u5, x13 + adcs u6, u6, x14 + adc u7, u7, xzr + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0]. Set up constants +// c = 4294968273 so that p_256k1 = 2^256 - c, and w the negated +// multiplicative inverse so that p_256k1 * w == -1 (mod 2^64). + + movz w, #0x3531 + movk w, #0xd225, lsl #16 + movk w, #0x091d, lsl #32 + movk w, #0xd838, lsl #48 + mov c, #977 + orr c, c, #0x100000000 + +// Do 4 iterations of Montgomery reduction, rotating [u3;u2;u1;u0] + + mul u0, w, u0 + umulh t, u0, c + subs u1, u1, t + + mul u1, w, u1 + umulh t, u1, c + sbcs u2, u2, t + + mul u2, w, u2 + umulh t, u2, c + sbcs u3, u3, t + + mul u3, w, u3 + umulh t, u3, c + sbcs u0, u0, t + + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Add the high part and the Montgomery reduced low part + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + and uu, u1, u2 + adcs u3, u3, u7 + and uu, uu, u3 + cset t, cs + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256 + + adds xzr, u0, c + adcs xzr, uu, xzr + adcs t, t, xzr + +// Now t <> 0 <=> z >= p_256k1, so mask the constant c accordingly + + csel c, c, xzr, ne + +// If z >= p_256k1 do z := z - p_256k1, i.e. 
add c in 4 digits + + adds u0, u0, c + adcs u1, u1, xzr + adcs u2, u2, xzr + adc u3, u3, xzr + +// Write back + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1_alt.S new file mode 100644 index 00000000000..f4d141c9256 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_montsqr_p256k1_alt.S @@ -0,0 +1,194 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256k1, assuming x^2 <= 2^256 * p_256k1, which +// is guaranteed in particular if x < p_256k1 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define h x6 +#define l x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 +#define u7 x15 + +// Just aliases (we only use w after loading the inputs) + +#define w x +#define t h +#define c a0 +#define uu a1 + +S2N_BN_SYMBOL(bignum_montsqr_p256k1_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). +// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + +// Start the Montgomery reductions now to interleave better, though +// conceptually they all happen after the multiplication, only modifying +// any u_i when the multiplication process no longer uses it. Set up +// constants c = 4294968273 so that p_256k1 = 2^256 - c, and w the negated +// multiplicative inverse so that p_256k1 * w == -1 (mod 2^64). +// Precompute a little ahead of the main Montgomery stage. 
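A compact Python model of what the four rotating word-level iterations, together with the conditional correction further down, achieve (sketch of the net effect only, not the register-level schedule; montredc4 is an illustrative name):

import random

p = 2**256 - 4294968273
w = (-pow(p, -1, 2**64)) % 2**64

def montredc4(t):
    # Net effect: reduce t (< 2^256 * p) to t / 2^256 mod p_256k1.
    for _ in range(4):
        m = ((t % 2**64) * w) % 2**64
        t = (t + m * p) >> 64
    return t - p if t >= p else t

for _ in range(1000):
    x = random.randrange(p)
    s = x * x                        # the 8-digit square being reduced
    assert montredc4(s) == (s * pow(2**256, -1, p)) % p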
+ + movz w, #0x3531 + movk w, #0xd225, lsl #16 + movk w, #0x091d, lsl #32 + movk w, #0xd838, lsl #48 + mov c, #977 + orr c, c, #0x100000000 + mul u0, w, u0 + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0']. We actually precomputed +// the Montgomery multiplier in u0, but otherwise continue with +// 4 iterations of Montgomery reduction, rotating [u3;u2;u1;u0] + + umulh l, u0, c + subs u1, u1, l + + mul u1, w, u1 + umulh l, u1, c + sbcs u2, u2, l + + mul u2, w, u2 + umulh l, u2, c + sbcs u3, u3, l + + mul u3, w, u3 + umulh l, u3, c + sbcs u0, u0, l + + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Add the high part and the Montgomery reduced low part + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + and uu, u1, u2 + adcs u3, u3, u7 + and uu, uu, u3 + cset t, cs + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256 + + adds xzr, u0, c + adcs xzr, uu, xzr + adcs t, t, xzr + +// Now t <> 0 <=> z >= p_256k1, so mask the constant c accordingly + + csel c, c, xzr, ne + +// If z >= p_256k1 do z := z - p_256k1, i.e. add c in 4 digits + + adds u0, u0, c + adcs u1, u1, xzr + adcs u2, u2, xzr + adc u3, u3, xzr + +// Write back + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1.S new file mode 100644 index 00000000000..6b2b5aee802 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1.S @@ -0,0 +1,302 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_256k1, z := (x * y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define b0 x5 +#define b1 x6 + +#define u0 x7 +#define u1 x8 +#define u2 x9 +#define u3 x10 +#define u4 x11 +#define u5 x12 +#define u6 x13 +#define u7 x14 + +#define t x15 + +#define sgn x16 +#define ysgn x17 + +// These are aliases to registers used elsewhere including input pointers. +// By the time they are used this does not conflict with other uses. 
+ +#define m0 y +#define m1 ysgn +#define m2 t +#define m3 x +#define u u2 + +// For the reduction stages, again aliasing other things + +#define c x1 +#define h x2 +#define l x15 +#define d x16 +#define q x17 +#define a2 x11 +#define a3 x12 +#define b2 x13 +#define b3 x14 + +S2N_BN_SYMBOL(bignum_mul_p256k1): + +// Multiply the low halves using Karatsuba 2x2->4 to get [u3,u2,u1,u0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul u2, a1, b1 + umulh u3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u2, u2, u1 + adc u3, u3, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u1, u0, u2 + adcs u2, u2, u3 + adc u3, u3, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u1, t, u1 + eor a0, a0, sgn + adcs u2, a0, u2 + adc u3, u3, sgn + +// Multiply the high halves using Karatsuba 2x2->4 to get [u7,u6,u5,u4] + + ldp a0, a1, [x, #16] + ldp b0, b1, [y, #16] + + mul u4, a0, b0 + umulh u5, a0, b0 + mul u6, a1, b1 + umulh u7, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm sgn, cc + + adds u6, u6, u5 + adc u7, u7, xzr + + subs a0, b0, b1 + cneg a0, a0, cc + cinv sgn, sgn, cc + + mul t, a1, a0 + umulh a0, a1, a0 + + adds u5, u4, u6 + adcs u6, u6, u7 + adc u7, u7, xzr + + adds xzr, sgn, #1 + eor t, t, sgn + adcs u5, t, u5 + eor a0, a0, sgn + adcs u6, a0, u6 + adc u7, u7, sgn + +// Compute sgn,[a1,a0] = x_hi - x_lo +// and ysgn,[b1,b0] = y_lo - y_hi +// sign-magnitude differences + + ldp a0, a1, [x, #16] + ldp t, sgn, [x] + subs a0, a0, t + sbcs a1, a1, sgn + csetm sgn, cc + + ldp t, ysgn, [y] + subs b0, t, b0 + sbcs b1, ysgn, b1 + csetm ysgn, cc + + eor a0, a0, sgn + subs a0, a0, sgn + eor a1, a1, sgn + sbc a1, a1, sgn + + eor b0, b0, ysgn + subs b0, b0, ysgn + eor b1, b1, ysgn + sbc b1, b1, ysgn + +// Save the correct sign for the sub-product + + eor sgn, ysgn, sgn + +// Add H' = H + L_top, still in [u7,u6,u5,u4] + + adds u4, u4, u2 + adcs u5, u5, u3 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Now compute the mid-product as [m3,m2,m1,m0] + + mul m0, a0, b0 + umulh m1, a0, b0 + mul m2, a1, b1 + umulh m3, a1, b1 + + subs a1, a1, a0 + cneg a1, a1, cc + csetm u, cc + + adds m2, m2, m1 + adc m3, m3, xzr + + subs b1, b0, b1 + cneg b1, b1, cc + cinv u, u, cc + + mul b0, a1, b1 + umulh b1, a1, b1 + + adds m1, m0, m2 + adcs m2, m2, m3 + adc m3, m3, xzr + + adds xzr, u, #1 + eor b0, b0, u + adcs m1, b0, m1 + eor b1, b1, u + adcs m2, b1, m2 + adc m3, m3, u + +// Accumulate the positive mid-terms as [u7,u6,u5,u4,u3,u2] + + adds u2, u4, u0 + adcs u3, u5, u1 + adcs u4, u6, u4 + adcs u5, u7, u5 + adcs u6, u6, xzr + adc u7, u7, xzr + +// Add in the sign-adjusted complex term + + adds xzr, sgn, #1 + eor m0, m0, sgn + adcs u2, m0, u2 + eor m1, m1, sgn + adcs u3, m1, u3 + eor m2, m2, sgn + adcs u4, m2, u4 + eor m3, m3, sgn + adcs u5, m3, u5 + adcs u6, u6, sgn + adc u7, u7, sgn + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 4294968273 * h + l (mod p_256k1) +// Some of the word products are done straightforwardly using mul + umulh +// while others are broken down in a more complicated way as +// (2^32 + 977) * (2^32 * h + l) = 2^64 * h + 2^32 * (d * h + l) + d * l + + mov d, #977 + orr c, d, #0x100000000 + + mul a0, c, u4 + umulh b0, c, u4 + + and l, u5, #0xFFFFFFFF + lsr h, u5, #32 + mul a1, d, l + madd l, d, h, l + adds a1, a1, l, lsl #32 + lsr l, l, #32 + adc b1, h, l + + mul a2, c, u6 + umulh b2, c, u6 + + and l, u7, #0xFFFFFFFF 
+ lsr h, u7, #32 + mul a3, d, l + madd l, d, h, l + adds a3, a3, l, lsl #32 + lsr l, l, #32 + adc b3, h, l + + adds u0, u0, a0 + adcs u1, u1, a1 + adcs u2, u2, a2 + adcs u3, u3, a3 + cset u4, cs + + adds u1, u1, b0 + adcs u2, u2, b1 + adcs u3, u3, b2 + adc u4, u4, b3 + +// Now we have reduced to 5 digits, 2^256 * h + l = [u4,u3,u2,u1,u0] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. +// Since q <= 2^33 we do 4294968273 * q = (q<<32) + 977 * q to avoid umulh + + add q, u4, #1 + mul a0, d, q + lsr a1, q, #32 + adds a0, a0, q, lsl #32 + adc a1, xzr, a1 + adds u0, u0, a0 + adcs u1, u1, a1 + adcs u2, u2, xzr + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Write back and return + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1_alt.S new file mode 100644 index 00000000000..5ed08227792 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_mul_p256k1_alt.S @@ -0,0 +1,199 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_256k1, z := (x * y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define l x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 + +#define c b0 +#define q b1 +#define h b2 + +S2N_BN_SYMBOL(bignum_mul_p256k1_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul l, a0, b1 + umulh u2, a0, b1 + adds u1, u1, l + + ldp b2, b3, [y, #16] + + mul l, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, l + + mul l, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, l + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul l, a1, b0 + adds u1, u1, l + mul l, a1, b1 + adcs u2, u2, l + mul l, a1, b2 + adcs u3, u3, l + mul l, a1, b3 + adcs u4, u4, l + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh l, a1, b0 + adds u2, u2, l + umulh l, a1, b1 + adcs u3, u3, l + umulh l, a1, b2 + adcs u4, u4, l + adc u5, u5, xzr + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul l, a2, b0 + adds u2, u2, l + mul l, a2, b1 + adcs u3, u3, l + mul l, a2, b2 + adcs u4, u4, l + mul l, a2, b3 + adcs u5, u5, l + umulh u6, a2, b3 + adc 
u6, u6, xzr + + umulh l, a2, b0 + adds u3, u3, l + umulh l, a2, b1 + adcs u4, u4, l + umulh l, a2, b2 + adcs u5, u5, l + adc u6, u6, xzr + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] + + mul l, a3, b0 + adds u3, u3, l + mul l, a3, b1 + adcs u4, u4, l + mul l, a3, b2 + adcs u5, u5, l + mul l, a3, b3 + adcs u6, u6, l + umulh u7, a3, b3 + adc u7, u7, xzr + + umulh l, a3, b0 + adds u4, u4, l + umulh l, a3, b1 + adcs u5, u5, l + umulh l, a3, b2 + adcs u6, u6, l + adc u7, u7, xzr + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 4294968273 * h + l (mod p_256k1) + + mov c, #977 + orr c, c, #0x100000000 + + mul l, c, u4 + umulh h, c, u4 + adds u0, u0, l + + mul l, c, u5 + umulh u5, c, u5 + adcs u1, u1, l + + mul l, c, u6 + umulh u6, c, u6 + adcs u2, u2, l + + mul l, c, u7 + umulh u7, c, u7 + adcs u3, u3, l + cset u4, cs + + adds u1, u1, h + adcs u2, u2, u5 + adcs u3, u3, u6 + adc u4, u4, u7 + +// Now we have reduced to 5 digits, 2^256 * h + l = [u4,u3,u2,u1,u0] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + add q, u4, #1 + mul l, c, q + umulh h, c, q + adds u0, u0, l + adcs u1, u1, h + adcs u2, u2, xzr + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Write back and return + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_neg_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_neg_p256k1.S new file mode 100644 index 00000000000..d0749fac9e4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_neg_p256k1.S @@ -0,0 +1,65 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define p x2 +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define c x7 + +S2N_BN_SYMBOL(bignum_neg_p256k1): + +// Load the 4 digits of x and let c be an OR of all the digits + + ldp d0, d1, [x] + orr c, d0, d1 + ldp d2, d3, [x, #16] + orr c, c, d2 + orr c, c, d3 + +// Turn q into a strict bitmask, and c a masked constant -4294968273, +// computing it in effect as ~4294968272 = ~(2^32 + 976) + + cmp c, xzr + csetm p, ne + mov c, #976 + orr c, c, #0x100000000 + bic c, p, c + +// Now just do [2^256 - 4294968273] - x where the constant is masked + + subs d0, c, d0 + sbcs d1, p, d1 + sbcs d2, p, d2 + sbc d3, p, d3 + +// Write back result and return + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_optneg_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_optneg_p256k1.S new file mode 100644 index 00000000000..86f8dd5e177 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_optneg_p256k1.S @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p256k1 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = p, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p256k1) + .text + .balign 4 + +#define z x0 +#define p x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define c x7 + +S2N_BN_SYMBOL(bignum_optneg_p256k1): + +// Load the 4 digits of x and let c be an OR of all the digits + + ldp d0, d1, [x] + orr c, d0, d1 + ldp d2, d3, [x, #16] + orr c, c, d2 + orr c, c, d3 + +// Turn p into a strict bitmask. Force it to zero if the input is zero, +// to avoid giving -0 = p_256k1, which is not reduced though correct modulo. 
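The intended input/output relation, including the zero special case noted above, in a few lines of Python (optneg_ref is an illustrative name; flag models the p argument):

p = 2**256 - 4294968273

def optneg_ref(flag, x):
    # z := (-x) mod p_256k1 if flag is nonzero, else x, for reduced x;
    # zero must map to zero rather than to p_256k1 in either case.
    assert 0 <= x < p
    return (-x) % p if flag else x

assert optneg_ref(1, 0) == 0 and optneg_ref(0, 0) == 0
assert optneg_ref(1, 5) == p - 5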
+ + cmp p, xzr + csetm p, ne + cmp c, xzr + csel p, xzr, p, eq + +// We want z := if p then (2^256 - 4294968273) - x else x +// which is: [if p then ~x else x] - [if p then 4294968272 else 0] + + mov c, #976 + orr c, c, #0x100000000 + and c, c, p + + eor d0, d0, p + subs d0, d0, c + eor d1, d1, p + sbcs d1, d1, xzr + eor d2, d2, p + sbcs d2, d2, xzr + eor d3, d3, p + sbc d3, d3, xzr + +// Write back result and return + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1.S new file mode 100644 index 00000000000..b579acef398 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1.S @@ -0,0 +1,223 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_256k1, z := (x^2) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 + +// Variables + +#define u0 x2 +#define u1 x3 +#define u2 x4 +#define u3 x5 +#define u4 x6 +#define u5 x7 +#define u6 x8 +#define u7 x9 + +#define a0 x10 +#define a1 x11 +#define a2 x12 +#define b0 x13 +#define b1 x14 +#define b3 x15 +#define c x16 +#define d x17 + +// Some additional aliases + +#define l u4 +#define h u5 +#define b2 u6 +#define q u4 +#define a3 u7 + +S2N_BN_SYMBOL(bignum_sqr_p256k1): + +// First just a near-clone of bignum_sqr_4_8 to get the square, using +// different registers to collect full product without writeback. 
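The reduction applied after this full square rests on 2^256 == 4294968273 (mod p_256k1); a Python sketch of the fold-then-estimate strategy spelled out in the comments below (mod_p256k1_ref is an illustrative name):

import random

p = 2**256 - 4294968273
c = 4294968273
assert pow(2, 256, p) == c           # 2^256 == 4294968273 (mod p_256k1)

def mod_p256k1_ref(s):
    # Fold the 8-digit s = 2^256*h + l to the 5-digit c*h + l, then apply
    # the quotient estimate q = h' + 1 ("right or 1 too big") and correct.
    h, l = s >> 256, s % 2**256
    t = c * h + l
    q = (t >> 256) + 1
    r = t - q * p
    return r + p if r < 0 else r

for _ in range(1000):
    x = random.randrange(p)
    assert mod_p256k1_ref(x * x) == (x * x) % p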
+ + ldp x10, x11, [x1] + ldp x12, x13, [x1, #16] + umull x2, w10, w10 + lsr x14, x10, #32 + umull x3, w14, w14 + umull x14, w10, w14 + adds x2, x2, x14, lsl #33 + lsr x14, x14, #31 + adc x3, x3, x14 + umull x4, w11, w11 + lsr x14, x11, #32 + umull x5, w14, w14 + umull x14, w11, w14 + mul x15, x10, x11 + umulh x16, x10, x11 + adds x4, x4, x14, lsl #33 + lsr x14, x14, #31 + adc x5, x5, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x5, x5, xzr + adds x3, x3, x15 + adcs x4, x4, x16 + adc x5, x5, xzr + umull x6, w12, w12 + lsr x14, x12, #32 + umull x7, w14, w14 + umull x14, w12, w14 + adds x6, x6, x14, lsl #33 + lsr x14, x14, #31 + adc x7, x7, x14 + umull x8, w13, w13 + lsr x14, x13, #32 + umull x9, w14, w14 + umull x14, w13, w14 + mul x15, x12, x13 + umulh x16, x12, x13 + adds x8, x8, x14, lsl #33 + lsr x14, x14, #31 + adc x9, x9, x14 + adds x15, x15, x15 + adcs x16, x16, x16 + adc x9, x9, xzr + adds x7, x7, x15 + adcs x8, x8, x16 + adc x9, x9, xzr + subs x10, x10, x12 + sbcs x11, x11, x13 + csetm x16, cc + eor x10, x10, x16 + subs x10, x10, x16 + eor x11, x11, x16 + sbc x11, x11, x16 + adds x6, x6, x4 + adcs x7, x7, x5 + adcs x8, x8, xzr + adc x9, x9, xzr + umull x12, w10, w10 + lsr x5, x10, #32 + umull x13, w5, w5 + umull x5, w10, w5 + adds x12, x12, x5, lsl #33 + lsr x5, x5, #31 + adc x13, x13, x5 + umull x15, w11, w11 + lsr x5, x11, #32 + umull x14, w5, w5 + umull x5, w11, w5 + mul x4, x10, x11 + umulh x16, x10, x11 + adds x15, x15, x5, lsl #33 + lsr x5, x5, #31 + adc x14, x14, x5 + adds x4, x4, x4 + adcs x16, x16, x16 + adc x14, x14, xzr + adds x13, x13, x4 + adcs x15, x15, x16 + adc x14, x14, xzr + adds x4, x2, x6 + adcs x5, x3, x7 + adcs x6, x6, x8 + adcs x7, x7, x9 + csetm x16, cc + subs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x15 + sbcs x7, x7, x14 + adcs x8, x8, x16 + adc x9, x9, x16 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 4294968273 * h + l (mod p_256k1) +// Some of the word products are done straightforwardly using mul + umulh +// while others are broken down in a more complicated way as +// (2^32 + 977) * (2^32 * h + l) = 2^64 * h + 2^32 * (d * h + l) + d * l + + mov d, #977 + orr c, d, #0x100000000 + + mul a0, c, u4 + umulh b0, c, u4 + + and l, u5, #0xFFFFFFFF + lsr h, u5, #32 + mul a1, d, l + madd l, d, h, l + adds a1, a1, l, lsl #32 + lsr l, l, #32 + adc b1, h, l + + mul a2, c, u6 + umulh b2, c, u6 + + and l, u7, #0xFFFFFFFF + lsr h, u7, #32 + mul a3, d, l + madd l, d, h, l + adds a3, a3, l, lsl #32 + lsr l, l, #32 + adc b3, h, l + + adds u0, u0, a0 + adcs u1, u1, a1 + adcs u2, u2, a2 + adcs u3, u3, a3 + cset u4, cs + + adds u1, u1, b0 + adcs u2, u2, b1 + adcs u3, u3, b2 + adc u4, u4, b3 + +// Now we have reduced to 5 digits, 2^256 * h + l = [u4,u3,u2,u1,u0] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. +// Since q <= 2^33 we do 4294968273 * q = (q<<32) + 977 * q to avoid umulh + + add q, u4, #1 + mul a0, d, q + lsr a1, q, #32 + adds a0, a0, q, lsl #32 + adc a1, xzr, a1 + adds u0, u0, a0 + adcs u1, u1, a1 + adcs u2, u2, xzr + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 4294968273, i.e. 
by +// adding p_256k1 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Write back + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1_alt.S new file mode 100644 index 00000000000..3565e48300f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sqr_p256k1_alt.S @@ -0,0 +1,174 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_256k1, z := (x^2) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define h x6 +#define l x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 + +// Just aliases + +#define q a0 +#define c a1 +#define t a2 +#define u7 h + +S2N_BN_SYMBOL(bignum_sqr_p256k1_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). +// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [u7,u6,u5,u4] and l = [u3,u2,u1,u0] +// and this is == 4294968273 * h + l (mod p_256k1) + + mov c, #977 + orr c, c, #0x100000000 + + mul l, c, u4 + umulh t, c, u4 + adds u0, u0, l + + mul l, c, u5 + umulh u5, c, u5 + adcs u1, u1, l + + mul l, c, u6 + umulh u6, c, u6 + adcs u2, u2, l + + mul l, c, u7 + umulh u7, c, u7 + adcs u3, u3, l + cset u4, cs + + adds u1, u1, t + adcs u2, u2, u5 + adcs u3, u3, u6 + adc u4, u4, u7 + +// Now we have reduced to 5 digits, 2^256 * h + l = [u4,u3,u2,u1,u0] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. 
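+// (Added sketch, not from the upstream source: with q = h + 1 and
+// p_256k1 = 2^256 - 4294968273, the remainder is
+//    z - q * p_256k1 = (l + q * 4294968273) - 2^256,
+// so the carry out of the addition below says whether the estimate was
+// exact (CF set) or one too big (CF clear), which the masked subtraction
+// just below then corrects.)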
+ + add q, u4, #1 + mul l, c, q + umulh h, c, q + adds u0, u0, l + adcs u1, u1, h + adcs u2, u2, xzr + adcs u3, u3, xzr + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + csel c, c, xzr, cc + subs u0, u0, c + sbcs u1, u1, xzr + sbcs u2, u2, xzr + sbc u3, u3, xzr + +// Write back and return + + stp u0, u1, [x0] + stp u2, u3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sub_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sub_p256k1.S new file mode 100644 index 00000000000..b291a529358 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_sub_p256k1.S @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_256k1, z := (x - y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p256k1) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define l x4 +#define d0 x5 +#define d1 x6 +#define d2 x7 +#define d3 x8 + +S2N_BN_SYMBOL(bignum_sub_p256k1): + +// First just subtract the numbers as [d3; d2; d1; d0] = x - y, +// with the inverted carry flag meaning CF <=> x >= y. + + ldp d0, d1, [x] + ldp l, c, [y] + subs d0, d0, l + sbcs d1, d1, c + ldp d2, d3, [x, #16] + ldp l, c, [y, #16] + sbcs d2, d2, l + sbcs d3, d3, c + +// Now if x < y we want to add back p_256k1, which staying within 4 digits +// means subtracting 4294968273, since p_256k1 = 2^256 - 4294968273. +// Let c be that constant 4294968273 when x < y, zero otherwise. + + mov l, #977 + orr c, l, #0x100000000 + csel c, c, xzr, cc + +// Now correct by adding masked p_256k1, i.e. subtracting c + + subs d0, d0, c + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_tomont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_tomont_p256k1.S new file mode 100644 index 00000000000..b9284870de5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_tomont_p256k1.S @@ -0,0 +1,101 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256k1) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define m x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define a0 x7 +#define a1 x8 + +#define a2 x9 +#define c x9 + +#define a3 x10 +#define h x10 +#define q x10 + +S2N_BN_SYMBOL(bignum_tomont_p256k1): + +S2N_BN_SYMBOL(bignum_tomont_p256k1_alt): + +// Since 2^256 == 4294968273 (mod p_256k1) we more or less just set +// m = 4294968273 then devolve to a near-clone of bignum_cmul_p256k1; +// the logic that q = h + 1 < 2^64 and hence doesn't wrap still holds +// since the multiplier 4294968273 is known to be much less than 2^64. +// We can also re-use the initial constant m instead of re-creating it. + + mov m, #977 + orr m, m, #0x100000000 + +// First do the multiply, straightforwardly to get [h;d3;d2;d1;d0] + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + mul d0, m, a0 + mul d1, m, a1 + mul d2, m, a2 + mul d3, m, a3 + umulh a0, m, a0 + umulh a1, m, a1 + umulh a2, m, a2 + umulh h, m, a3 + adds d1, d1, a0 + adcs d2, d2, a1 + adcs d3, d3, a2 + adcs h, h, xzr + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + add q, h, #1 + mul a0, q, m + umulh a1, q, m + adds d0, d0, a0 + adcs d1, d1, a1 + adcs d2, d2, xzr + adcs d3, d3, xzr + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting m. + + csel m, m, xzr, cc + subs d0, d0, m + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbcs d3, d3, xzr + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_triple_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_triple_p256k1.S new file mode 100644 index 00000000000..803ca582f06 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/bignum_triple_p256k1.S @@ -0,0 +1,102 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256k1, z := (3 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo +// p_256k1, and the result is always fully reduced, z = (3 * x) mod p_256k1. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256k1) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256k1_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define h x6 + +// Slightly offset aliases for the d_i for readability. + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 + +// More aliases for the same thing at different stages + +#define m x6 + +// Other temporary variables + +#define c x7 + +S2N_BN_SYMBOL(bignum_triple_p256k1): +S2N_BN_SYMBOL(bignum_triple_p256k1_alt): + +// Load the inputs + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] + + adds d0, a0, a0, lsl #1 + extr d1, a1, a0, #63 + adcs d1, d1, a1 + extr d2, a2, a1, #63 + adcs d2, d2, a2 + extr d3, a3, a2, #63 + adcs d3, d3, a3 + lsr h, a3, #63 + adc h, h, xzr + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256k1 <= z - q * p_256k1 < p_256k1. + + mov c, #977 + orr c, c, #0x100000000 + madd m, h, c, c + +// Initial subtraction of z - q * p_256k1, actually by adding q * 4294968273. + + adds d0, d0, m + adcs d1, d1, xzr + adcs d2, d2, xzr + adcs d3, d3, xzr + +// With z = 2^256 * h + l, the underlying result z' is actually +// (2^256 * h + l) - q * (2^256 - 4294968273) = (l + q * 4294968273) - 2^256 +// so carry-clear <=> z' is negative. Correct by subtracting in that case. + + csel c, c, xzr, cc + subs d0, d0, c + sbcs d1, d1, xzr + sbcs d2, d2, xzr + sbc d3, d3, xzr + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd.S new file mode 100644 index 00000000000..52545b3fd83 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd.S @@ -0,0 +1,549 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_256k1, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". 
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x19 +#define input_x x20 +#define input_y x21 + +// The magic constant 2^256 - p_256k1 + +#define pconst x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_mul_p256k1 except for registers and +// re-use of the pconst register for the constant 4294968273 + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x7, x3, x5 __LF \ + umulh x8, x3, x5 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + mul x11, x3, x5 __LF \ + umulh x12, x3, x5 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, lo __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, lo __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc 
x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x9, lo __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, lo __LF \ + cinv x9, x9, lo __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x16, #977 __LF \ + mul x3, pconst, x11 __LF \ + umulh x5, pconst, x11 __LF \ + and x15, x12, #0xffffffff __LF \ + lsr x2, x12, #32 __LF \ + mul x4, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x4, x4, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x6, x2, x15 __LF \ + mul x11, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + and x15, x14, #0xffffffff __LF \ + lsr x2, x14, #32 __LF \ + mul x12, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x12, x12, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x14, x2, x15 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + cset x11, hs __LF \ + adds x8, x8, x5 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, x14 __LF \ + add x0, x11, #1 __LF \ + mul x3, x16, x0 __LF \ + lsr x4, x0, #32 __LF \ + adds x3, x3, x0, lsl #32 __LF \ + adc x4, xzr, x4 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x1, pconst, xzr, lo __LF \ + subs x7, x7, x1 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1 except for +// re-use of the pconst register for the constant 4294968273 + +#define sqr_p256k1(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, 
x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, lo __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, lo __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x16, #977 __LF \ + mul x10, pconst, x6 __LF \ + umulh x13, pconst, x6 __LF \ + and x6, x7, #0xffffffff __LF \ + lsr x7, x7, #32 __LF \ + mul x11, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x11, x11, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x14, x7, x6 __LF \ + mul x12, pconst, x8 __LF \ + umulh x8, pconst, x8 __LF \ + and x6, x9, #0xffffffff __LF \ + lsr x7, x9, #32 __LF \ + mul x9, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x9, x9, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x15, x7, x6 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adcs x5, x5, x9 __LF \ + cset x6, hs __LF \ + adds x3, x3, x13 __LF \ + adcs x4, x4, x14 __LF \ + adcs x5, x5, x8 __LF \ + adc x6, x6, x15 __LF \ + add x6, x6, #1 __LF \ + mul x10, x16, x6 __LF \ + lsr x11, x6, #32 __LF \ + adds x10, x10, x6, lsl #32 __LF \ + adc x11, xzr, x11 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csel x16, pconst, xzr, lo __LF \ + subs x2, x2, x16 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, xzr __LF \ + sbc x5, x5, xzr __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #0x3d1 __LF \ + orr x3, x4, #0x100000000 __LF \ + csel x3, x3, xzr, cc __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(secp256k1_jadd): + +// Save registers and make room on stack for temporary variables + + sub sp, sp, NSPACE+32 + stp x19, x20, [sp, NSPACE] + stp x21, x22, [sp, NSPACE+16] + +// Move the input arguments to stable place + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Set up pconst = 4294968273, so p_256k1 = 2^256 - pconst + + mov pconst, #977 + orr pconst, pconst, #0x100000000 + +// Main code, just a sequence of basic field operations + + sqr_p256k1(z1sq,z_1) + 
sqr_p256k1(z2sq,z_2) + + mul_p256k1(y1a,z_2,y_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,z1sq,x_2) + mul_p256k1(x1a,z2sq,x_1) + mul_p256k1(y2a,z1sq,y2a) + mul_p256k1(y1a,z2sq,y1a) + + sub_p256k1(xd,x2a,x1a) + sub_p256k1(yd,y2a,y1a) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x1a) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(xd,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y1a) + mul_p256k1(resz,xd,z_2) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + ldp x19, x20, [sp, NSPACE] + ldp x21, x22, [sp, NSPACE+16] + add sp, sp, NSPACE+32 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd_alt.S new file mode 100644 index 00000000000..b62656fccb7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jadd_alt.S @@ -0,0 +1,421 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). 
+// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_256k1, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_mul_p256k1_alt + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 
__LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x3d1 __LF \ + orr x7, x7, #0x100000000 __LF \ + mul x11, x7, x1 __LF \ + umulh x9, x7, x1 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x0, x0, x11 __LF \ + cset x1, cs __LF \ + adds x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x0, x0, x4 __LF \ + adc x1, x1, x5 __LF \ + add x8, x1, #0x1 __LF \ + mul x11, x7, x8 __LF \ + umulh x9, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, xzr __LF \ + adcs x0, x0, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x0, x0, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1_alt + +#define sqr_p256k1(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x3d1 __LF \ + orr x3, x3, #0x100000000 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + add x2, x12, #0x1 __LF \ + mul x7, x3, x2 __LF \ + umulh x6, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, xzr __LF \ + adcs x11, x11, xzr __LF \ + csel x3, x3, xzr, cc __LF \ + subs x8, x8, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #0x3d1 __LF \ + orr x3, x4, #0x100000000 __LF \ + csel x3, x3, xzr, cc __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + 
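+// (Readability sketch, not part of the upstream file; the labels a, b, c,
+// d, e, f are introduced only for this note. With a = x1*z2^2, b = x2*z1^2,
+// c = y1*z2^3, d = y2*z1^3, e = b - a and f = d - c, the sequence below
+// computes
+//    x3 = f^2 - e^2 * (a + b)
+//    y3 = f * (e^2 * a - x3) - c * e^3
+//    z3 = e * z1 * z2
+// and finally multiplexes in the original inputs to cover z1 = 0 or z2 = 0.)
+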
+S2N_BN_SYMBOL(secp256k1_jadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p256k1(z1sq,z_1) + sqr_p256k1(z2sq,z_2) + + mul_p256k1(y1a,z_2,y_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,z1sq,x_2) + mul_p256k1(x1a,z2sq,x_1) + mul_p256k1(y2a,z1sq,y2a) + mul_p256k1(y1a,z2sq,y1a) + + sub_p256k1(xd,x2a,x1a) + sub_p256k1(yd,y2a,y1a) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x1a) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(xd,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y1a) + mul_p256k1(resz,xd,z_2) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble.S new file mode 100644 index 00000000000..22b30d022e4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble.S @@ -0,0 +1,890 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). 
+// It is assumed that all coordinates of the input point are fully +// reduced mod p_256k1 and that the z coordinate is not zero. +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jdouble) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x19 +#define input_x x20 + +// The magic constant 2^256 - p_256k1 + +#define pconst x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries + +#define x_2 sp, #(NUMSIZE*0) +#define y_2 sp, #(NUMSIZE*1) +#define d sp, #(NUMSIZE*2) +#define tmp sp, #(NUMSIZE*3) +#define x_4 sp, #(NUMSIZE*4) +#define y_4 sp, #(NUMSIZE*6) +#define dx2 sp, #(NUMSIZE*8) +#define xy2 sp, #(NUMSIZE*10) + +#define NSPACE #(NUMSIZE*12) + +// Corresponds exactly to bignum_mul_p256k1 except for registers and +// re-use of the pconst register for the constant 4294968273 + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x7, x3, x5 __LF \ + umulh x8, x3, x5 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + mul x11, x3, x5 __LF \ + umulh x12, x3, x5 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, lo __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, lo __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x9, lo __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, lo __LF \ + cinv x9, x9, lo __LF \ + mul x5, x4, x6 __LF \ + 
umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x16, #977 __LF \ + mul x3, pconst, x11 __LF \ + umulh x5, pconst, x11 __LF \ + and x15, x12, #0xffffffff __LF \ + lsr x2, x12, #32 __LF \ + mul x4, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x4, x4, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x6, x2, x15 __LF \ + mul x11, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + and x15, x14, #0xffffffff __LF \ + lsr x2, x14, #32 __LF \ + mul x12, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x12, x12, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x14, x2, x15 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + cset x11, hs __LF \ + adds x8, x8, x5 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, x14 __LF \ + add x0, x11, #1 __LF \ + mul x3, x16, x0 __LF \ + lsr x4, x0, #32 __LF \ + adds x3, x3, x0, lsl #32 __LF \ + adc x4, xzr, x4 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x1, pconst, xzr, lo __LF \ + subs x7, x7, x1 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1 except for +// re-use of the pconst register for the constant 4294968273 + +#define sqr_p256k1(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, lo __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 
__LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, lo __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x16, #977 __LF \ + mul x10, pconst, x6 __LF \ + umulh x13, pconst, x6 __LF \ + and x6, x7, #0xffffffff __LF \ + lsr x7, x7, #32 __LF \ + mul x11, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x11, x11, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x14, x7, x6 __LF \ + mul x12, pconst, x8 __LF \ + umulh x8, pconst, x8 __LF \ + and x6, x9, #0xffffffff __LF \ + lsr x7, x9, #32 __LF \ + mul x9, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x9, x9, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x15, x7, x6 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adcs x5, x5, x9 __LF \ + cset x6, hs __LF \ + adds x3, x3, x13 __LF \ + adcs x4, x4, x14 __LF \ + adcs x5, x5, x8 __LF \ + adc x6, x6, x15 __LF \ + add x6, x6, #1 __LF \ + mul x10, x16, x6 __LF \ + lsr x11, x6, #32 __LF \ + adds x10, x10, x6, lsl #32 __LF \ + adc x11, xzr, x11 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csel x16, pconst, xzr, lo __LF \ + subs x2, x2, x16 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, xzr __LF \ + sbc x5, x5, xzr __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// Rough versions producing 5-word results + +#define roughmul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x7, x3, x5 __LF \ + umulh x8, x3, x5 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + mul x11, x3, x5 __LF \ + umulh x12, x3, x5 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, 
[P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, lo __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, lo __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x9, lo __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, lo __LF \ + cinv x9, x9, lo __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x16, #977 __LF \ + mul x3, pconst, x11 __LF \ + umulh x5, pconst, x11 __LF \ + and x15, x12, #0xffffffff __LF \ + lsr x2, x12, #32 __LF \ + mul x4, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x4, x4, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x6, x2, x15 __LF \ + mul x11, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + and x15, x14, #0xffffffff __LF \ + lsr x2, x14, #32 __LF \ + mul x12, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x12, x12, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x14, x2, x15 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + cset x11, hs __LF \ + adds x8, x8, x5 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, x14 __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] __LF \ + str x11, [P0+32] + +#define roughsqr_p256k1(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr 
x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, lo __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, lo __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x16, #977 __LF \ + mul x10, pconst, x6 __LF \ + umulh x13, pconst, x6 __LF \ + and x6, x7, #0xffffffff __LF \ + lsr x7, x7, #32 __LF \ + mul x11, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x11, x11, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x14, x7, x6 __LF \ + mul x12, pconst, x8 __LF \ + umulh x8, pconst, x8 __LF \ + and x6, x9, #0xffffffff __LF \ + lsr x7, x9, #32 __LF \ + mul x9, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x9, x9, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x15, x7, x6 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adcs x5, x5, x9 __LF \ + cset x6, hs __LF \ + adds x3, x3, x13 __LF \ + adcs x4, x4, x14 __LF \ + adcs x5, x5, x8 __LF \ + adc x6, x6, x15 __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] __LF \ + str x6, [P0+32] + +// Weak doubling operation, staying in 4 digits but not in general +// fully normalizing modulo p_256k1 + +#define weakdouble_p256k1(P0,P1) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #1 __LF \ + ldp x3, x4, [P1+16] __LF \ + ands xzr, x4, #0x8000000000000000 __LF \ + csel x5, pconst, xzr, ne __LF \ + extr x1, x2, x1, #63 __LF \ + adds x0, x0, x5 __LF \ + extr x2, x3, x2, #63 __LF \ + adcs x1, x1, xzr __LF \ + extr x3, x4, x3, #63 __LF \ + adcs x2, x2, xzr __LF \ + stp x0, x1, [P0] __LF \ + adc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = C * P1 - D * P2 with 5-word inputs P1 and P2 +// Only used here with C = 12, D = 9, but could be used more generally. 
+// We start with (2^40 * 2^256 + C * P1) - (D * P2 + 2^40 * k) +// where p_256k1 = 2^256 - k (so k = 4294968273) + +#define cmsub_p256k1(P0,C,P1,D,P2) \ + mov x10, C __LF \ + ldp x4, x5, [P1] __LF \ + mul x0, x4, x10 __LF \ + mul x1, x5, x10 __LF \ + ldp x6, x7, [P1+16] __LF \ + mul x2, x6, x10 __LF \ + mul x3, x7, x10 __LF \ + ldr x13, [P1+32] __LF \ + umulh x4, x4, x10 __LF \ + adds x1, x1, x4 __LF \ + umulh x5, x5, x10 __LF \ + adcs x2, x2, x5 __LF \ + umulh x6, x6, x10 __LF \ + adcs x3, x3, x6 __LF \ + umulh x4, x7, x10 __LF \ + mul x13, x13, x10 __LF \ + adc x9, x4, x13 __LF \ + orr x9, x9, #0x10000000000 __LF \ + /* [x9; x3;x2;x1;x0] = 2^40 * 2^256 + C * P1 */ \ + mov x10, D __LF \ + ldp x13, x14, [P2] __LF \ + mul x5, x14, x10 __LF \ + umulh x6, x14, x10 __LF \ + adds x5, x5, pconst, lsr #24 __LF \ + adc x6, x6, xzr __LF \ + mul x4, x13, x10 __LF \ + adds x4, x4, pconst, lsl #40 __LF \ + umulh x13, x13, x10 __LF \ + adcs x5, x5, x13 __LF \ + ldp x13, x14, [P2+16] __LF \ + mul x12, x13, x10 __LF \ + umulh x7, x13, x10 __LF \ + ldr x13, [P2+32] __LF \ + adcs x6, x6, x12 __LF \ + mul x12, x14, x10 __LF \ + umulh x8, x14, x10 __LF \ + mul x13, x13, x10 __LF \ + adcs x7, x7, x12 __LF \ + adc x8, x8, x13 __LF \ + /* [x8; x7;x6;x5;x4] = D * P2 + 2^40 * k */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, x5 __LF \ + sbcs x2, x2, x6 __LF \ + sbcs x3, x3, x7 __LF \ + sbc x4, x9, x8 __LF \ + /* [x4; x3;x2;x1;x0] = 2^40*p_256k1+result */ \ + add x10, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x10 __LF \ + umulh x5, pconst, x10 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x11, pconst, xzr, cc __LF \ + /* If un-correction needed */ \ + subs x0, x0, x11 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2 with 5-digit P1 and P2 +// We start with (2^40 * 2^256 + 3 * P1) - (8 * P2 + 2^40 * k) +// where p_256k1 = 2^256 - k (so k = 4294968273) + +#define cmsub38_p256k1(P0,P1,P2) \ + mov x10, #3 __LF \ + ldp x4, x5, [P1] __LF \ + mul x0, x4, x10 __LF \ + mul x1, x5, x10 __LF \ + ldp x6, x7, [P1+16] __LF \ + mul x2, x6, x10 __LF \ + mul x3, x7, x10 __LF \ + ldr x13, [P1+32] __LF \ + umulh x4, x4, x10 __LF \ + adds x1, x1, x4 __LF \ + umulh x5, x5, x10 __LF \ + adcs x2, x2, x5 __LF \ + umulh x6, x6, x10 __LF \ + adcs x3, x3, x6 __LF \ + umulh x4, x7, x10 __LF \ + mul x13, x13, x10 __LF \ + adc x9, x4, x13 __LF \ + orr x9, x9, #0x10000000000 __LF \ + /* [x9; x3;x2;x1;x0] = 2^40 * 2^256 + 3 * P1 */ \ + lsl x12, pconst, #40 __LF \ + ldp x13, x14, [P2] __LF \ + lsl x4, x13, #3 __LF \ + adds x4, x4, x12 __LF \ + extr x5, x14, x13, #61 __LF \ + lsr x12, pconst, #24 __LF \ + adcs x5, x5, x12 __LF \ + ldp x11, x12, [P2+16] __LF \ + extr x6, x11, x14, #61 __LF \ + adcs x6, x6, xzr __LF \ + ldr x13, [P2+32] __LF \ + extr x7, x12, x11, #61 __LF \ + adcs x7, x7, xzr __LF \ + extr x8, x13, x12, #61 __LF \ + adc x8, x8, xzr __LF \ + /* [x8; x7;x6;x5;x4] = 8 * P2 + 2^40 * k */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, x5 __LF \ + sbcs x2, x2, x6 __LF \ + sbcs x3, x3, x7 __LF \ + sbc x4, x9, x8 __LF \ + /* [x4; x3;x2;x1;x0] = 2^40*p_256k1+result */ \ + add x10, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x10 __LF \ + umulh x5, pconst, x10 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x11, pconst, xzr, cc __LF \ + /* If 
un-correction needed */ \ + subs x0, x0, x11 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = 4 * P1 - P2 with 5-digit P1, 4-digit P2 and result. +// This is done by direct subtraction of P2 since the method +// in bignum_cmul_p256k1 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256k1, which is the case here. + +#define cmsub41_p256k1(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + ldr x5, [P1+32] __LF \ + extr x4, x5, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x5, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x5 __LF \ + adds x0, x0, x4 __LF \ + umulh x5, pconst, x5 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x4, pconst, xzr, cc __LF \ + /* If un-correction needed */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +S2N_BN_SYMBOL(secp256k1_jdouble): + +// Save registers and make room on stack for temporary variables + + sub sp, sp, NSPACE+16 + stp x19, x20, [sp, NSPACE] + +// Move the input arguments to stable place + + mov input_z, x0 + mov input_x, x1 + +// Set up pconst = 4294968273, so p_256k1 = 2^256 - pconst + + mov pconst, #977 + orr pconst, pconst, #0x100000000 + +// Main sequence of operations + + // y_2 = y^2 + + sqr_p256k1(y_2,y_1) + + // x_2 = x^2 + + sqr_p256k1(x_2,x_1) + + // tmp = 2 * y_1 (in 4 words but not fully normalized) + + weakdouble_p256k1(tmp,y_1) + + // xy2 = x * y^2 (5-digit partially reduced) + // x_4 = x^4 (5-digit partially reduced) + + roughmul_p256k1(xy2,x_1,y_2) + roughsqr_p256k1(x_4,x_2) + + // z_3 = 2 * y_1 * z_1 + + mul_p256k1(z_3,z_1,tmp) + + // d = 12 * xy2 - 9 * x_4 + + cmsub_p256k1(d,12,xy2,9,x_4) + + // y4 = y2^2 (5-digit partially reduced) + + roughsqr_p256k1(y_4,y_2) + + // dx2 = d * x_2 (5-digit partially reduced) + + roughmul_p256k1(dx2,x_2,d) + + // x_3 = 4 * xy2 - d + + cmsub41_p256k1(x_3,xy2,d) + + // y_3 = 3 * dx2 - 8 * y_4 + + cmsub38_p256k1(y_3,dx2,y_4) + +// Restore stack and return + + ldp x19, x20, [sp, NSPACE] + add sp, sp, NSPACE+16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble_alt.S new file mode 100644 index 00000000000..4af92a167a7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jdouble_alt.S @@ -0,0 +1,660 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). 
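The main sequence in secp256k1_jdouble above, repeated in this _alt variant, evaluates d = 12*xy2 - 9*x_4, x_3 = 4*xy2 - d, y_3 = 3*dx2 - 8*y_4 and z_3 = 2*y_1*z_1. A Python sketch cross-checking those formulas against affine doubling on y^2 = x^3 + 7 (illustration only; it uses arbitrary-precision integers rather than 4-word values):

```python
# Cross-check of the Jacobian doubling formulas used in the main sequence.
p = 2**256 - 4294968273             # p_256k1

def jdouble(X, Y, Z):
    y2  = Y * Y % p                 # y_2 = y_1^2
    x2  = X * X % p                 # x_2 = x_1^2
    xy2 = X * y2 % p                # xy2 = x_1 * y_1^2
    x4  = x2 * x2 % p               # x_4 = x_1^4
    z3  = 2 * Y * Z % p             # z_3 = 2 * y_1 * z_1
    d   = (12 * xy2 - 9 * x4) % p
    y4  = y2 * y2 % p
    dx2 = d * x2 % p
    x3  = (4 * xy2 - d) % p
    y3  = (3 * dx2 - 8 * y4) % p
    return x3, y3, z3

# Pick any affine point on y^2 = x^3 + 7 (p % 4 == 3, so sqrt is a power).
x = 1
while True:
    x += 1
    rhs = (x * x * x + 7) % p
    y = pow(rhs, (p + 1) // 4, p)
    if y * y % p == rhs:
        break

# Affine doubling via the tangent slope, for comparison.
lam = 3 * x * x * pow(2 * y, -1, p) % p
ax = (lam * lam - 2 * x) % p
ay = (lam * (x - ax) - y) % p

X3, Y3, Z3 = jdouble(x, y, 1)
zi = pow(Z3, -1, p)
assert (X3 * zi * zi) % p == ax and (Y3 * pow(zi, 3, p)) % p == ay
```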
+// It is assumed that all coordinates of the input point are fully +// reduced mod p_256k1 and that the z coordinate is not zero. +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jdouble_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 + +// The magic constant 2^256 - p_256k1 + +#define pconst x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries + +#define x_2 sp, #(NUMSIZE*0) +#define y_2 sp, #(NUMSIZE*1) +#define d sp, #(NUMSIZE*2) +#define tmp sp, #(NUMSIZE*3) +#define x_4 sp, #(NUMSIZE*4) +#define y_4 sp, #(NUMSIZE*6) +#define dx2 sp, #(NUMSIZE*8) +#define xy2 sp, #(NUMSIZE*10) + +#define NSPACE #(NUMSIZE*12) + +// Corresponds exactly to bignum_mul_p256k1_alt except for +// re-use of the pconst register for the constant 4294968273 + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mul x11, pconst, x1 __LF \ + umulh x9, pconst, x1 __LF \ + adds x12, x12, x11 __LF \ + mul x11, pconst, x3 __LF \ + umulh x3, pconst, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, pconst, x4 __LF \ + umulh x4, pconst, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, pconst, x5 __LF \ + umulh x5, pconst, x5 __LF \ + adcs x0, x0, x11 __LF \ + cset x1, cs __LF \ + adds x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs 
x0, x0, x4 __LF \ + adc x1, x1, x5 __LF \ + add x8, x1, #0x1 __LF \ + mul x11, pconst, x8 __LF \ + umulh x9, pconst, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, xzr __LF \ + adcs x0, x0, xzr __LF \ + csel x7, pconst, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x0, x0, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1_alt except for +// re-use of the pconst register for the constant 4294968273 + +#define sqr_p256k1(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mul x7, pconst, x12 __LF \ + umulh x4, pconst, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, pconst, x14 __LF \ + umulh x14, pconst, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, pconst, x6 __LF \ + umulh x6, pconst, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + add x2, x12, #0x1 __LF \ + mul x7, pconst, x2 __LF \ + umulh x6, pconst, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, xzr __LF \ + adcs x11, x11, xzr __LF \ + csel x3, pconst, xzr, cc __LF \ + subs x8, x8, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Rough versions producing 5-word results + +#define roughmul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, 
xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mul x11, pconst, x1 __LF \ + umulh x9, pconst, x1 __LF \ + adds x12, x12, x11 __LF \ + mul x11, pconst, x3 __LF \ + umulh x3, pconst, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, pconst, x4 __LF \ + umulh x4, pconst, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, pconst, x5 __LF \ + umulh x5, pconst, x5 __LF \ + adcs x0, x0, x11 __LF \ + cset x1, cs __LF \ + adds x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x0, x0, x4 __LF \ + adc x1, x1, x5 __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] __LF \ + str x1, [P0+32] + +#define roughsqr_p256k1(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mul x7, pconst, x12 __LF \ + umulh x4, pconst, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, pconst, x14 __LF \ + umulh x14, pconst, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, pconst, x6 __LF \ + umulh x6, pconst, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] __LF \ + str x12, [P0+32] + +// Weak doubling operation, staying in 4 digits but not in general +// fully normalizing modulo p_256k1 + +#define weakdouble_p256k1(P0,P1) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #1 __LF \ + ldp x3, x4, [P1+16] __LF \ + ands xzr, x4, #0x8000000000000000 __LF \ + csel x5, pconst, xzr, ne __LF \ + extr x1, x2, x1, #63 __LF \ + adds x0, x0, x5 __LF \ + 
extr x2, x3, x2, #63 __LF \ + adcs x1, x1, xzr __LF \ + extr x3, x4, x3, #63 __LF \ + adcs x2, x2, xzr __LF \ + stp x0, x1, [P0] __LF \ + adc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = C * P1 - D * P2 with 5-word inputs P1 and P2 +// Only used here with C = 12, D = 9, but could be used more generally. +// We start with (2^40 * 2^256 + C * P1) - (D * P2 + 2^40 * k) +// where p_256k1 = 2^256 - k (so k = 4294968273) + +#define cmsub_p256k1(P0,C,P1,D,P2) \ + mov x10, C __LF \ + ldp x4, x5, [P1] __LF \ + mul x0, x4, x10 __LF \ + mul x1, x5, x10 __LF \ + ldp x6, x7, [P1+16] __LF \ + mul x2, x6, x10 __LF \ + mul x3, x7, x10 __LF \ + ldr x13, [P1+32] __LF \ + umulh x4, x4, x10 __LF \ + adds x1, x1, x4 __LF \ + umulh x5, x5, x10 __LF \ + adcs x2, x2, x5 __LF \ + umulh x6, x6, x10 __LF \ + adcs x3, x3, x6 __LF \ + umulh x4, x7, x10 __LF \ + mul x13, x13, x10 __LF \ + adc x9, x4, x13 __LF \ + orr x9, x9, #0x10000000000 __LF \ + /* [x9; x3;x2;x1;x0] = 2^40 * 2^256 + C * P1 */ \ + mov x10, D __LF \ + ldp x13, x14, [P2] __LF \ + mul x5, x14, x10 __LF \ + umulh x6, x14, x10 __LF \ + adds x5, x5, pconst, lsr #24 __LF \ + adc x6, x6, xzr __LF \ + mul x4, x13, x10 __LF \ + adds x4, x4, pconst, lsl #40 __LF \ + umulh x13, x13, x10 __LF \ + adcs x5, x5, x13 __LF \ + ldp x13, x14, [P2+16] __LF \ + mul x12, x13, x10 __LF \ + umulh x7, x13, x10 __LF \ + ldr x13, [P2+32] __LF \ + adcs x6, x6, x12 __LF \ + mul x12, x14, x10 __LF \ + umulh x8, x14, x10 __LF \ + mul x13, x13, x10 __LF \ + adcs x7, x7, x12 __LF \ + adc x8, x8, x13 __LF \ + /* [x8; x7;x6;x5;x4] = D * P2 + 2^40 * k */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, x5 __LF \ + sbcs x2, x2, x6 __LF \ + sbcs x3, x3, x7 __LF \ + sbc x4, x9, x8 __LF \ + /* [x4; x3;x2;x1;x0] = 2^40*p_256k1+result */ \ + add x10, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x10 __LF \ + umulh x5, pconst, x10 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x11, pconst, xzr, cc __LF \ + /* If un-correction needed */ \ + subs x0, x0, x11 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2 with 5-digit P1 and P2 +// We start with (2^40 * 2^256 + 3 * P1) - (8 * P2 + 2^40 * k) +// where p_256k1 = 2^256 - k (so k = 4294968273) + +#define cmsub38_p256k1(P0,P1,P2) \ + mov x10, #3 __LF \ + ldp x4, x5, [P1] __LF \ + mul x0, x4, x10 __LF \ + mul x1, x5, x10 __LF \ + ldp x6, x7, [P1+16] __LF \ + mul x2, x6, x10 __LF \ + mul x3, x7, x10 __LF \ + ldr x13, [P1+32] __LF \ + umulh x4, x4, x10 __LF \ + adds x1, x1, x4 __LF \ + umulh x5, x5, x10 __LF \ + adcs x2, x2, x5 __LF \ + umulh x6, x6, x10 __LF \ + adcs x3, x3, x6 __LF \ + umulh x4, x7, x10 __LF \ + mul x13, x13, x10 __LF \ + adc x9, x4, x13 __LF \ + orr x9, x9, #0x10000000000 __LF \ + /* [x9; x3;x2;x1;x0] = 2^40 * 2^256 + 3 * P1 */ \ + lsl x12, pconst, #40 __LF \ + ldp x13, x14, [P2] __LF \ + lsl x4, x13, #3 __LF \ + adds x4, x4, x12 __LF \ + extr x5, x14, x13, #61 __LF \ + lsr x12, pconst, #24 __LF \ + adcs x5, x5, x12 __LF \ + ldp x11, x12, [P2+16] __LF \ + extr x6, x11, x14, #61 __LF \ + adcs x6, x6, xzr __LF \ + ldr x13, [P2+32] __LF \ + extr x7, x12, x11, #61 __LF \ + adcs x7, x7, xzr __LF \ + extr x8, x13, x12, #61 __LF \ + adc x8, x8, xzr __LF \ + /* [x8; x7;x6;x5;x4] = 8 * P2 + 2^40 * k */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, x5 __LF \ + sbcs x2, x2, x6 __LF \ + sbcs x3, x3, x7 __LF \ + sbc x4, x9, x8 __LF \ + /* [x4; 
x3;x2;x1;x0] = 2^40*p_256k1+result */ \ + add x10, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x10 __LF \ + umulh x5, pconst, x10 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x11, pconst, xzr, cc __LF \ + /* If un-correction needed */ \ + subs x0, x0, x11 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +// P0 = 4 * P1 - P2 with 5-digit P1, 4-digit P2 and result. +// This is done by direct subtraction of P2 since the method +// in bignum_cmul_p256k1 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256k1, which is the case here. + +#define cmsub41_p256k1(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + ldr x5, [P1+32] __LF \ + extr x4, x5, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x5, x4, #1 __LF \ + /* (h + 1) is quotient estimate */ \ + mul x4, pconst, x5 __LF \ + adds x0, x0, x4 __LF \ + umulh x5, pconst, x5 __LF \ + adcs x1, x1, x5 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, xzr __LF \ + csel x4, pconst, xzr, cc __LF \ + /* If un-correction needed */ \ + subs x0, x0, x4 __LF \ + sbcs x1, x1, xzr __LF \ + stp x0, x1, [P0] __LF \ + sbcs x2, x2, xzr __LF \ + sbc x3, x3, xzr __LF \ + stp x2, x3, [P0+16] + +S2N_BN_SYMBOL(secp256k1_jdouble_alt): + +// Make room on stack for temp registers + + sub sp, sp, NSPACE + +// Move the input arguments to stable place + + mov input_z, x0 + mov input_x, x1 + +// Set up pconst = 4294968273, so p_256k1 = 2^256 - pconst + + mov pconst, #977 + orr pconst, pconst, #0x100000000 + +// Main sequence of operations + + // y_2 = y^2 + + sqr_p256k1(y_2,y_1) + + // x_2 = x^2 + + sqr_p256k1(x_2,x_1) + + // tmp = 2 * y_1 (in 4 words but not fully normalized) + + weakdouble_p256k1(tmp,y_1) + + // xy2 = x * y^2 (5-digit partially reduced) + // x_4 = x^4 (5-digit partially reduced) + + roughmul_p256k1(xy2,x_1,y_2) + roughsqr_p256k1(x_4,x_2) + + // z_3 = 2 * y_1 * z_1 + + mul_p256k1(z_3,z_1,tmp) + + // d = 12 * xy2 - 9 * x_4 + + cmsub_p256k1(d,12,xy2,9,x_4) + + // y4 = y2^2 (5-digit partially reduced) + + roughsqr_p256k1(y_4,y_2) + + // dx2 = d * x_2 (5-digit partially reduced) + + roughmul_p256k1(dx2,x_2,d) + + // x_3 = 4 * xy2 - d + + cmsub41_p256k1(x_3,xy2,d) + + // y_3 = 3 * dx2 - 8 * y_4 + + cmsub38_p256k1(y_3,dx2,y_4) + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd.S new file mode 100644 index 00000000000..660a7ebb18b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd.S @@ -0,0 +1,507 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
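All of the p_256k1 macros in these files reduce a top word h against p_256k1 = 2^256 - k the same way: take q = h + 1 as the quotient estimate, add q*k, and subtract k again ("un-correction") when no carry out of 2^256 appears. A Python sketch of that arithmetic identity in isolation (illustration only; carry-flag plumbing and the 2^40 offset used by the cmsub macros are omitted):

```python
# Quotient-estimate reduction of v = 2^256*h + l modulo p_256k1 = 2^256 - k.
import random

k = 4294968273
p = 2**256 - k

def reduce_top_word(h, l):
    q = h + 1                       # (h + 1) is the quotient estimate
    t = l + q * k                   # v - q*p == t - 2^256
    if t >= 2**256:                 # carry out: the estimate was exact
        return t - 2**256
    return t - k                    # no carry: un-correct, i.e. use q = h

for _ in range(1000):
    h = random.getrandbits(64)
    l = random.getrandbits(256)
    assert reduce_top_word(h, l) == ((h << 256) + l) % p
```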
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_256k1, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jmixadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x19 +#define input_x x20 +#define input_y x21 + +// The magic constant 2^256 - p_256k1 + +#define pconst x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p256k1 except for registers and +// re-use of the pconst register for the constant 4294968273 + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P2] __LF \ + mul x7, x3, x5 __LF \ + umulh x8, x3, x5 __LF \ + mul x9, x4, x6 __LF \ + umulh x10, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x9, x9, x8 __LF \ + adc x10, x10, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x8, x7, x9 __LF \ + adcs x9, x9, x10 __LF \ + adc x10, x10, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x8, x15, x8 __LF \ + eor x3, x3, x16 __LF \ + adcs x9, x3, x9 __LF \ + adc x10, x10, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x5, x6, [P2+16] __LF \ + mul x11, x3, x5 __LF \ + umulh x12, x3, x5 __LF \ + mul x13, x4, x6 __LF \ + umulh x14, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x16, lo __LF \ + adds x13, x13, x12 __LF \ + adc x14, x14, xzr __LF \ + subs x3, x5, x6 __LF \ + cneg x3, x3, lo __LF \ + cinv x16, x16, lo __LF \ + mul x15, x4, x3 __LF \ + umulh x3, x4, x3 __LF \ + adds x12, x11, x13 __LF \ + adcs x13, x13, x14 
__LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x15, x15, x16 __LF \ + adcs x12, x15, x12 __LF \ + eor x3, x3, x16 __LF \ + adcs x13, x3, x13 __LF \ + adc x14, x14, x16 __LF \ + ldp x3, x4, [P1+16] __LF \ + ldp x15, x16, [P1] __LF \ + subs x3, x3, x15 __LF \ + sbcs x4, x4, x16 __LF \ + csetm x16, lo __LF \ + ldp x15, x0, [P2] __LF \ + subs x5, x15, x5 __LF \ + sbcs x6, x0, x6 __LF \ + csetm x0, lo __LF \ + eor x3, x3, x16 __LF \ + subs x3, x3, x16 __LF \ + eor x4, x4, x16 __LF \ + sbc x4, x4, x16 __LF \ + eor x5, x5, x0 __LF \ + subs x5, x5, x0 __LF \ + eor x6, x6, x0 __LF \ + sbc x6, x6, x0 __LF \ + eor x16, x0, x16 __LF \ + adds x11, x11, x9 __LF \ + adcs x12, x12, x10 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + mul x2, x3, x5 __LF \ + umulh x0, x3, x5 __LF \ + mul x15, x4, x6 __LF \ + umulh x1, x4, x6 __LF \ + subs x4, x4, x3 __LF \ + cneg x4, x4, lo __LF \ + csetm x9, lo __LF \ + adds x15, x15, x0 __LF \ + adc x1, x1, xzr __LF \ + subs x6, x5, x6 __LF \ + cneg x6, x6, lo __LF \ + cinv x9, x9, lo __LF \ + mul x5, x4, x6 __LF \ + umulh x6, x4, x6 __LF \ + adds x0, x2, x15 __LF \ + adcs x15, x15, x1 __LF \ + adc x1, x1, xzr __LF \ + cmn x9, #1 __LF \ + eor x5, x5, x9 __LF \ + adcs x0, x5, x0 __LF \ + eor x6, x6, x9 __LF \ + adcs x15, x6, x15 __LF \ + adc x1, x1, x9 __LF \ + adds x9, x11, x7 __LF \ + adcs x10, x12, x8 __LF \ + adcs x11, x13, x11 __LF \ + adcs x12, x14, x12 __LF \ + adcs x13, x13, xzr __LF \ + adc x14, x14, xzr __LF \ + cmn x16, #1 __LF \ + eor x2, x2, x16 __LF \ + adcs x9, x2, x9 __LF \ + eor x0, x0, x16 __LF \ + adcs x10, x0, x10 __LF \ + eor x15, x15, x16 __LF \ + adcs x11, x15, x11 __LF \ + eor x1, x1, x16 __LF \ + adcs x12, x1, x12 __LF \ + adcs x13, x13, x16 __LF \ + adc x14, x14, x16 __LF \ + mov x16, #977 __LF \ + mul x3, pconst, x11 __LF \ + umulh x5, pconst, x11 __LF \ + and x15, x12, #0xffffffff __LF \ + lsr x2, x12, #32 __LF \ + mul x4, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x4, x4, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x6, x2, x15 __LF \ + mul x11, pconst, x13 __LF \ + umulh x13, pconst, x13 __LF \ + and x15, x14, #0xffffffff __LF \ + lsr x2, x14, #32 __LF \ + mul x12, x16, x15 __LF \ + madd x15, x16, x2, x15 __LF \ + adds x12, x12, x15, lsl #32 __LF \ + lsr x15, x15, #32 __LF \ + adc x14, x2, x15 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + cset x11, hs __LF \ + adds x8, x8, x5 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, x13 __LF \ + adc x11, x11, x14 __LF \ + add x0, x11, #1 __LF \ + mul x3, x16, x0 __LF \ + lsr x4, x0, #32 __LF \ + adds x3, x3, x0, lsl #32 __LF \ + adc x4, xzr, x4 __LF \ + adds x7, x7, x3 __LF \ + adcs x8, x8, x4 __LF \ + adcs x9, x9, xzr __LF \ + adcs x10, x10, xzr __LF \ + csel x1, pconst, xzr, lo __LF \ + subs x7, x7, x1 __LF \ + sbcs x8, x8, xzr __LF \ + sbcs x9, x9, xzr __LF \ + sbc x10, x10, xzr __LF \ + stp x7, x8, [P0] __LF \ + stp x9, x10, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1 except for +// re-use of the pconst register for the constant 4294968273 + +#define sqr_p256k1(P0,P1) \ + ldp x10, x11, [P1] __LF \ + ldp x12, x13, [P1+16] __LF \ + umull x2, w10, w10 __LF \ + lsr x14, x10, #32 __LF \ + umull x3, w14, w14 __LF \ + umull x14, w10, w14 __LF \ + adds x2, x2, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x3, x3, x14 __LF \ + umull x4, w11, w11 __LF \ + lsr x14, x11, #32 __LF \ + umull x5, w14, w14 __LF \ + umull x14, w11, w14 __LF \ + mul x15, x10, x11 __LF \ + umulh x16, 
x10, x11 __LF \ + adds x4, x4, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x5, x5, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x5, x5, xzr __LF \ + adds x3, x3, x15 __LF \ + adcs x4, x4, x16 __LF \ + adc x5, x5, xzr __LF \ + umull x6, w12, w12 __LF \ + lsr x14, x12, #32 __LF \ + umull x7, w14, w14 __LF \ + umull x14, w12, w14 __LF \ + adds x6, x6, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x7, x7, x14 __LF \ + umull x8, w13, w13 __LF \ + lsr x14, x13, #32 __LF \ + umull x9, w14, w14 __LF \ + umull x14, w13, w14 __LF \ + mul x15, x12, x13 __LF \ + umulh x16, x12, x13 __LF \ + adds x8, x8, x14, lsl #33 __LF \ + lsr x14, x14, #31 __LF \ + adc x9, x9, x14 __LF \ + adds x15, x15, x15 __LF \ + adcs x16, x16, x16 __LF \ + adc x9, x9, xzr __LF \ + adds x7, x7, x15 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, xzr __LF \ + subs x10, x10, x12 __LF \ + sbcs x11, x11, x13 __LF \ + csetm x16, lo __LF \ + eor x10, x10, x16 __LF \ + subs x10, x10, x16 __LF \ + eor x11, x11, x16 __LF \ + sbc x11, x11, x16 __LF \ + adds x6, x6, x4 __LF \ + adcs x7, x7, x5 __LF \ + adcs x8, x8, xzr __LF \ + adc x9, x9, xzr __LF \ + umull x12, w10, w10 __LF \ + lsr x5, x10, #32 __LF \ + umull x13, w5, w5 __LF \ + umull x5, w10, w5 __LF \ + adds x12, x12, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x13, x13, x5 __LF \ + umull x15, w11, w11 __LF \ + lsr x5, x11, #32 __LF \ + umull x14, w5, w5 __LF \ + umull x5, w11, w5 __LF \ + mul x4, x10, x11 __LF \ + umulh x16, x10, x11 __LF \ + adds x15, x15, x5, lsl #33 __LF \ + lsr x5, x5, #31 __LF \ + adc x14, x14, x5 __LF \ + adds x4, x4, x4 __LF \ + adcs x16, x16, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x13, x13, x4 __LF \ + adcs x15, x15, x16 __LF \ + adc x14, x14, xzr __LF \ + adds x4, x2, x6 __LF \ + adcs x5, x3, x7 __LF \ + adcs x6, x6, x8 __LF \ + adcs x7, x7, x9 __LF \ + csetm x16, lo __LF \ + subs x4, x4, x12 __LF \ + sbcs x5, x5, x13 __LF \ + sbcs x6, x6, x15 __LF \ + sbcs x7, x7, x14 __LF \ + adcs x8, x8, x16 __LF \ + adc x9, x9, x16 __LF \ + mov x16, #977 __LF \ + mul x10, pconst, x6 __LF \ + umulh x13, pconst, x6 __LF \ + and x6, x7, #0xffffffff __LF \ + lsr x7, x7, #32 __LF \ + mul x11, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x11, x11, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x14, x7, x6 __LF \ + mul x12, pconst, x8 __LF \ + umulh x8, pconst, x8 __LF \ + and x6, x9, #0xffffffff __LF \ + lsr x7, x9, #32 __LF \ + mul x9, x16, x6 __LF \ + madd x6, x16, x7, x6 __LF \ + adds x9, x9, x6, lsl #32 __LF \ + lsr x6, x6, #32 __LF \ + adc x15, x7, x6 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adcs x5, x5, x9 __LF \ + cset x6, hs __LF \ + adds x3, x3, x13 __LF \ + adcs x4, x4, x14 __LF \ + adcs x5, x5, x8 __LF \ + adc x6, x6, x15 __LF \ + add x6, x6, #1 __LF \ + mul x10, x16, x6 __LF \ + lsr x11, x6, #32 __LF \ + adds x10, x10, x6, lsl #32 __LF \ + adc x11, xzr, x11 __LF \ + adds x2, x2, x10 __LF \ + adcs x3, x3, x11 __LF \ + adcs x4, x4, xzr __LF \ + adcs x5, x5, xzr __LF \ + csel x16, pconst, xzr, lo __LF \ + subs x2, x2, x16 __LF \ + sbcs x3, x3, xzr __LF \ + sbcs x4, x4, xzr __LF \ + sbc x5, x5, xzr __LF \ + stp x2, x3, [P0] __LF \ + stp x4, x5, [P0+16] + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #0x3d1 
__LF \ + orr x3, x4, #0x100000000 __LF \ + csel x3, x3, xzr, cc __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(secp256k1_jmixadd): + +// Save registers and make room on stack for temporary variables + + sub sp, sp, NSPACE+32 + stp x19, x20, [sp, NSPACE] + stp x21, x22, [sp, NSPACE+16] + +// Move the input arguments to stable place + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Set up pconst = 4294968273, so p_256k1 = 2^256 - pconst + + mov pconst, #977 + orr pconst, pconst, #0x100000000 + +// Main code, just a sequence of basic field operations + + sqr_p256k1(zp2,z_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,zp2,x_2) + mul_p256k1(y2a,zp2,y2a) + + sub_p256k1(xd,x2a,x_1) + sub_p256k1(yd,y2a,y_1) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x_1) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(resz,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y_1) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #1 + csel x8, x8, x12, ne + csel x9, x9, xzr, ne + ldp x10, x11, [resz+16] + csel x10, x10, xzr, ne + csel x11, x11, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + ldp x19, x20, [sp, NSPACE] + ldp x21, x22, [sp, NSPACE+16] + add sp, sp, NSPACE+32 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd_alt.S new file mode 100644 index 00000000000..d0135945453 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/secp256k1/secp256k1_jmixadd_alt.S @@ -0,0 +1,379 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. 
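Both secp256k1_jmixadd above and this _alt variant run the same sequence of field operations (zp2, y2a, x2a, xd, yd, zz, ww, zzx1, zzx2, resx, resy, resz). A Python sketch checking that sequence against plain affine addition (illustration only; it does not model the z_1 = 0 multiplexing at the end):

```python
# Cross-check of the mixed-addition sequence against affine chord addition.
p = 2**256 - 4294968273

def jmixadd(X1, Y1, Z1, x2, y2):
    zp2  = Z1 * Z1 % p              # z_1^2
    y2a  = Z1 * y2 % p              # z_1 * y_2
    x2a  = zp2 * x2 % p             # x_2 * z_1^2
    y2a  = zp2 * y2a % p            # y_2 * z_1^3
    xd   = (x2a - X1) % p
    yd   = (y2a - Y1) % p
    zz   = xd * xd % p
    ww   = yd * yd % p
    zzx1 = zz * X1 % p
    zzx2 = zz * x2a % p
    resx = (ww - zzx1 - zzx2) % p
    t1   = (zzx2 - zzx1) * Y1 % p
    t2   = yd * (zzx1 - resx) % p
    resz = xd * Z1 % p
    return resx, (t2 - t1) % p, resz

# Two distinct affine points, found by brute force as in the doubling check.
pts, x = [], 1
while len(pts) < 2:
    x += 1
    rhs = (x * x * x + 7) % p
    y = pow(rhs, (p + 1) // 4, p)
    if y * y % p == rhs:
        pts.append((x, y))
(x1, y1), (x2, y2) = pts

# Affine addition for comparison.
lam = (y2 - y1) * pow(x2 - x1, -1, p) % p
ax = (lam * lam - x1 - x2) % p
ay = (lam * (x1 - ax) - y1) % p

z = 3                               # arbitrary nonzero z so p1 is genuinely Jacobian
X3, Y3, Z3 = jmixadd(x1 * z * z % p, y1 * pow(z, 3, p) % p, z, x2, y2)
zi = pow(Z3, -1, p)
assert (X3 * zi * zi) % p == ax and (Y3 * pow(zi, 3, p)) % p == ay
```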
It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_256k1, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jmixadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p256k1_alt + +#define mul_p256k1(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + mov x7, #0x3d1 __LF \ + orr x7, x7, #0x100000000 __LF \ + mul x11, x7, x1 __LF \ + 
umulh x9, x7, x1 __LF \ + adds x12, x12, x11 __LF \ + mul x11, x7, x3 __LF \ + umulh x3, x7, x3 __LF \ + adcs x13, x13, x11 __LF \ + mul x11, x7, x4 __LF \ + umulh x4, x7, x4 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x7, x5 __LF \ + umulh x5, x7, x5 __LF \ + adcs x0, x0, x11 __LF \ + cset x1, cs __LF \ + adds x13, x13, x9 __LF \ + adcs x14, x14, x3 __LF \ + adcs x0, x0, x4 __LF \ + adc x1, x1, x5 __LF \ + add x8, x1, #0x1 __LF \ + mul x11, x7, x8 __LF \ + umulh x9, x7, x8 __LF \ + adds x12, x12, x11 __LF \ + adcs x13, x13, x9 __LF \ + adcs x14, x14, xzr __LF \ + adcs x0, x0, xzr __LF \ + csel x7, x7, xzr, cc __LF \ + subs x12, x12, x7 __LF \ + sbcs x13, x13, xzr __LF \ + sbcs x14, x14, xzr __LF \ + sbc x0, x0, xzr __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds exactly to bignum_sqr_p256k1_alt + +#define sqr_p256k1(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x7, x2, x4 __LF \ + umulh x6, x2, x4 __LF \ + adds x10, x10, x7 __LF \ + adcs x11, x11, x6 __LF \ + mul x7, x3, x4 __LF \ + umulh x6, x3, x4 __LF \ + adc x6, x6, xzr __LF \ + adds x11, x11, x7 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x6 __LF \ + mul x7, x3, x5 __LF \ + umulh x6, x3, x5 __LF \ + adc x6, x6, xzr __LF \ + adds x12, x12, x7 __LF \ + adcs x13, x13, x6 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x6, cs __LF \ + umulh x7, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x7 __LF \ + mul x7, x3, x3 __LF \ + adcs x10, x10, x7 __LF \ + umulh x7, x3, x3 __LF \ + adcs x11, x11, x7 __LF \ + mul x7, x4, x4 __LF \ + adcs x12, x12, x7 __LF \ + umulh x7, x4, x4 __LF \ + adcs x13, x13, x7 __LF \ + mul x7, x5, x5 __LF \ + adcs x14, x14, x7 __LF \ + umulh x7, x5, x5 __LF \ + adc x6, x6, x7 __LF \ + mov x3, #0x3d1 __LF \ + orr x3, x3, #0x100000000 __LF \ + mul x7, x3, x12 __LF \ + umulh x4, x3, x12 __LF \ + adds x8, x8, x7 __LF \ + mul x7, x3, x13 __LF \ + umulh x13, x3, x13 __LF \ + adcs x9, x9, x7 __LF \ + mul x7, x3, x14 __LF \ + umulh x14, x3, x14 __LF \ + adcs x10, x10, x7 __LF \ + mul x7, x3, x6 __LF \ + umulh x6, x3, x6 __LF \ + adcs x11, x11, x7 __LF \ + cset x12, cs __LF \ + adds x9, x9, x4 __LF \ + adcs x10, x10, x13 __LF \ + adcs x11, x11, x14 __LF \ + adc x12, x12, x6 __LF \ + add x2, x12, #0x1 __LF \ + mul x7, x3, x2 __LF \ + umulh x6, x3, x2 __LF \ + adds x8, x8, x7 __LF \ + adcs x9, x9, x6 __LF \ + adcs x10, x10, xzr __LF \ + adcs x11, x11, xzr __LF \ + csel x3, x3, xzr, cc __LF \ + subs x8, x8, x3 __LF \ + sbcs x9, x9, xzr __LF \ + sbcs x10, x10, xzr __LF \ + sbc x11, x11, xzr __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + mov x4, #0x3d1 __LF \ + orr x3, x4, #0x100000000 __LF \ + csel x3, x3, xzr, cc __LF \ + subs x5, x5, x3 __LF \ + sbcs x6, x6, xzr __LF \ + sbcs x7, x7, xzr __LF \ + sbc x8, x8, xzr __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(secp256k1_jmixadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments 
to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations + + sqr_p256k1(zp2,z_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,zp2,x_2) + mul_p256k1(y2a,zp2,y2a) + + sub_p256k1(xd,x2a,x_1) + sub_p256k1(yd,y2a,y_1) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x_1) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(resz,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y_1) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #1 + csel x8, x8, x12, ne + csel x9, x9, xzr, ne + ldp x10, x11, [resz+16] + csel x10, x10, xzr, ne + csel x11, x11, xzr, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/Makefile new file mode 100644 index 00000000000..216db41a3f8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/Makefile @@ -0,0 +1,58 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise +# use a cross-assembling version so that the code can still be assembled +# and the proofs checked against the object files (though you won't be able +# to run code without additional emulation infrastructure). 
The aarch64 +# cross-assembling version can be installed manually by something like: +# +# sudo apt-get install binutils-aarch64-linux-gnu + +UNAME_RESULT=$(shell uname -p) + +ifeq ($(UNAME_RESULT),aarch64) +GAS=as +else +GAS=aarch64-linux-gnu-as +endif + +# List of object files + +OBJ = bignum_add_sm2.o \ + bignum_cmul_sm2.o \ + bignum_deamont_sm2.o \ + bignum_demont_sm2.o \ + bignum_double_sm2.o \ + bignum_half_sm2.o \ + bignum_inv_sm2.o \ + bignum_mod_nsm2.o \ + bignum_mod_nsm2_4.o \ + bignum_mod_sm2.o \ + bignum_mod_sm2_4.o \ + bignum_montinv_sm2.o \ + bignum_montmul_sm2.o \ + bignum_montmul_sm2_alt.o \ + bignum_montsqr_sm2.o \ + bignum_montsqr_sm2_alt.o \ + bignum_neg_sm2.o \ + bignum_optneg_sm2.o \ + bignum_sub_sm2.o \ + bignum_tomont_sm2.o \ + bignum_triple_sm2.o \ + sm2_montjadd.o \ + sm2_montjadd_alt.o \ + sm2_montjdouble.o \ + sm2_montjdouble_alt.o \ + sm2_montjmixadd.o \ + sm2_montjmixadd_alt.o \ + sm2_montjscalarmul.o \ + sm2_montjscalarmul_alt.o + +%.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - + +default: $(OBJ); + +clean:; rm -f *.o *.correct diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_add_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_add_sm2.S new file mode 100644 index 00000000000..84656cf9cfe --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_add_sm2.S @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_sm2) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define d0 x4 +#define d1 x5 +#define d2 x6 +#define d3 x7 +#define n0 x8 +#define n1 x9 +#define n2 x10 +#define n3 x11 + +S2N_BN_SYMBOL(bignum_add_sm2): + +// First just add the numbers as [c;d3;d2;d1;d0] + + ldp d0, d1, [x] + ldp n0, n1, [y] + adds d0, d0, n0 + adcs d1, d1, n1 + ldp d2, d3, [x, #16] + ldp n2, n3, [y, #16] + adcs d2, d2, n2 + adcs d3, d3, n3 + adc c, xzr, xzr + +// Now let [c;n3;n2;n1;n0] = [c;d3;d2;d1;d0] - p_sm2 + + subs n0, d0, #0xffffffffffffffff + mov n1, #0xffffffff00000000 + sbcs n1, d1, n1 + adcs n2, d2, xzr + mov n3, #0xfffffffeffffffff + sbcs n3, d3, n3 + sbcs c, c, xzr + +// Select result according to whether (x + y) - p_sm2 < 0 + + csel d0, d0, n0, cc + csel d1, d1, n1, cc + csel d2, d2, n2, cc + csel d3, d3, n3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_cmul_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_cmul_sm2.S new file mode 100644 index 00000000000..75a54982399 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_cmul_sm2.S @@ -0,0 +1,103 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
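bignum_add_sm2 above is the usual add-then-trial-subtract pattern: form x + y with a carry word, subtract p_sm2, and keep the subtracted value exactly when the subtraction does not borrow. In Python terms (illustration only; p_sm2 is read off the constants in the code):

```python
# The add/select pattern of bignum_add_sm2: z = (x + y) mod p_sm2 for
# already-reduced x and y, using a single trial subtraction of p_sm2.
import random

p_sm2 = 2**256 - 2**224 - 2**96 + 2**64 - 1   # 0xFFFFFFFEFFFF...0000FFFFFFFFFFFFFFFF

def add_sm2(x, y):
    s = x + y                        # [c; d3..d0], at most 257 bits
    t = s - p_sm2                    # trial subtraction
    return t if t >= 0 else s        # select on the (inverted) borrow

for _ in range(1000):
    x, y = random.randrange(p_sm2), random.randrange(p_sm2)
    assert add_sm2(x, y) == (x + y) % p_sm2
```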
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_sm2 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = c, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_sm2) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_sm2_alt) + .text + .balign 4 + +#define z x0 +#define m x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define h x7 +#define q x8 +#define a1 x9 +#define a2 x10 +#define a3 x11 +#define a4 x12 + +S2N_BN_SYMBOL(bignum_cmul_sm2): +S2N_BN_SYMBOL(bignum_cmul_sm2_alt): + +// First do the multiply, straightforwardly to get [h;d3;d2;d1;d0] + + ldp a1, a2, [x] + ldp a3, a4, [x, #16] + mul d0, m, a1 + mul d1, m, a2 + mul d2, m, a3 + mul d3, m, a4 + umulh a1, m, a1 + umulh a2, m, a2 + umulh a3, m, a3 + umulh h, m, a4 + adds d1, d1, a1 + adcs d2, d2, a2 + adcs d3, d3, a3 + adc h, h, xzr + +// Quotient approximation is (h * (1 + 2^32 + 2^64) + d3 + 2^64) >> 64. +// Note that by hypothesis our product is <= (2^64 - 1) * (p_sm2 - 1), +// so there is no need to max this out to avoid wrapping, unlike in the +// more general case of bignum_mod_sm2. + + adds a3, d3, h + mov a2, #1 + adc a1, h, a2 + add a2, h, a3, lsr #32 + add q, a1, a2, lsr #32 + +// Let a3 = q<<32 and a4 = q>>32 then [a2;a1] = 2^32 * q - q + + lsl a3, q, #32 + subs a1, a3, q + lsr a4, q, #32 + sbc a2, a4, xzr + +// Do the basic correction as [h;d3;d2;d1;d0] := [h;d3;d2;d1;d0] - q * p_sm2 + + sub h, h, q + adds d0, d0, q + adcs d1, d1, a1 + adcs d2, d2, a2 + adcs d3, d3, a3 + adc h, h, a4 + +// Use top word (which will be all zeros or all ones) as a mask to correct + + adds d0, d0, h + and a1, h, #0xffffffff00000000 + adcs d1, d1, a1 + adcs d2, d2, h + and a3, h, #0xfffffffeffffffff + adc d3, d3, a3 + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_deamont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_deamont_sm2.S new file mode 100644 index 00000000000..0cc467ea18b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_deamont_sm2.S @@ -0,0 +1,109 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_deamont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. 
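For reference, bignum_deamont_sm2 computes x * 2^-256 mod p_sm2, and "almost" means the input may be any 4-word value, not necessarily below p_sm2. A minimal Python model of that contract (illustration only):

```python
# Reference semantics of bignum_deamont_sm2: z = (x / 2^256) mod p_sm2,
# valid for any 256-bit x (the "almost-Montgomery" case).
import random

p_sm2 = 2**256 - 2**224 - 2**96 + 2**64 - 1
R = 2**256
R_inv = pow(R, -1, p_sm2)

def deamont(x):
    return (x * R_inv) % p_sm2

for _ in range(1000):
    a = random.getrandbits(256)              # no range restriction on the input
    assert deamont((a * R) % p_sm2) == a % p_sm2
```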
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_sm2) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows (with repetitions) + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define t x6 +#define u x7 +#define v x8 +#define w x9 + +S2N_BN_SYMBOL(bignum_deamont_sm2): + +// Set up an initial window with the input x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Systematically scroll left doing 1-step reductions. This process +// keeps things inside 4 digits (i.e. < 2^256) at each stage, since +// we have w * p_sm2 + x <= (2^64 - 1) * p_sm2 + (2 EXP 256 - 1) +// <= (2^64 - 1) * (2^256 - 1) + (2 EXP 256 - 1) <= 2^64 * (2^256 - 1) + + montreds(d0,d3,d2,d1,d0, t,u,v,w) + + montreds(d1,d0,d3,d2,d1, t,u,v,w) + + montreds(d2,d1,d0,d3,d2, t,u,v,w) + + montreds(d3,d2,d1,d0,d3, t,u,v,w) + +// Let [w;v;u;t] = [d3;d2;d1;d0] - p_sm2 + + subs t, d0, #-1 + mov u, #0xffffffff00000000 + sbcs u, d1, u + adcs v, d2, xzr + mov w, #0xfffffffeffffffff + sbcs w, d3, w + +// If [d3;d2;d1;d0] < p_sm2 then [d3;d2;d1;d0] is the final answer, +// being reduced mod p_sm2, otherwise [d3;d2;d1;d0] - p_sm2. + + csel d0, d0, t, cc + csel d1, d1, u, cc + csel d2, d2, v, cc + csel d3, d3, w, cc + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_demont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_demont_sm2.S new file mode 100644 index 00000000000..a10906d4f1b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_demont_sm2.S @@ -0,0 +1,90 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_sm2 for correctness. If this is not the case, +// use the variant "bignum_deamont_sm2" instead. 
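The montreds macro used by both conversion routines is one word-level Montgomery step: since p_sm2 ≡ -1 (mod 2^64), the Montgomery multiplier is simply d0, and the shift/subtract trick computes (x + d0 * p_sm2) / 2^64 without a 4-word multiplication. A Python sketch of that identity (illustration only):

```python
# One montreds step: result = (x + d0*p_sm2) >> 64 where d0 = x mod 2^64,
# computed as [d0;d3;d2;d1] - [t3;t2;t1;t0] with [t3;t2] = 2^32*d0 and
# [t1;t0] = (2^32 - 1)*d0, exactly as in the macro.
import random

p_sm2 = 2**256 - 2**224 - 2**96 + 2**64 - 1
M = 2**64 - 1

def montreds(x):
    d0, d1, d2, d3 = x & M, (x >> 64) & M, (x >> 128) & M, x >> 192
    minuend    = d1 | (d2 << 64) | (d3 << 128) | (d0 << 192)   # [d0;d3;d2;d1]
    subtrahend = (2**32 - 1) * d0 + ((2**32 * d0) << 128)      # [t3;t2;t1;t0]
    return minuend - subtrahend

for _ in range(1000):
    x = random.getrandbits(256)
    r = montreds(x)
    assert r == (x + (x & M) * p_sm2) >> 64   # x + d0*p_sm2 is divisible by 2^64
    assert 0 <= r < 2**256                    # stays within four words
    assert (r * 2**64 - x) % p_sm2 == 0       # i.e. r == x / 2^64 (mod p_sm2)
```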
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_sm2) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +// Input parameters + +#define z x0 +#define x x1 + +// Rotating registers for the intermediate windows (with repetitions) + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +// Other temporaries + +#define t x6 +#define u x7 +#define v x8 +#define w x9 + +S2N_BN_SYMBOL(bignum_demont_sm2): + +// Set up an initial window with the input x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Systematically scroll left doing 1-step reductions. This process +// keeps things reduced < p_sm2 at each stage, since we have +// w * p_sm2 + x <= (2^64 - 1) * p_sm2 + (p_sm2 - 1) < 2^64 * p_sm2 + + montreds(d0,d3,d2,d1,d0, t,u,v,w) + + montreds(d1,d0,d3,d2,d1, t,u,v,w) + + montreds(d2,d1,d0,d3,d2, t,u,v,w) + + montreds(d3,d2,d1,d0,d3, t,u,v,w) + +// Write back result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_double_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_double_sm2.S new file mode 100644 index 00000000000..629c3c33174 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_double_sm2.S @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_sm2) + .text + .balign 4 + +#define z x0 +#define x x1 +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define c x6 +#define n0 x7 +#define n1 x8 +#define n2 x9 +#define n3 x10 + +S2N_BN_SYMBOL(bignum_double_sm2): + +// Double the input number as 2 * x = c + [d3; d2; d1; d0] +// It's worth considering doing this with extr...63 instead + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + adds d0, d0, d0 + adcs d1, d1, d1 + adcs d2, d2, d2 + adcs d3, d3, d3 + adc c, xzr, xzr + +// Subtract p_sm2 to give 2 * x - p_sm2 = c + [n3; n2; n1; n0] + + subs n0, d0, #0xffffffffffffffff + mov n1, #0xffffffff00000000 + sbcs n1, d1, n1 + adcs n2, d2, xzr + mov n3, #0xfffffffeffffffff + sbcs n3, d3, n3 + sbcs c, c, xzr + +// Now CF is set (because of inversion) if 2 * x >= p_sm2, in which case the +// correct result is [n3; n2; n1; n0], otherwise [d3; d2; d1; d0] + + csel d0, d0, n0, cc + csel d1, d1, n1, cc + csel d2, d2, n2, cc + csel d3, d3, n3, cc + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_half_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_half_sm2.S new file mode 100644 index 00000000000..b144c9757bc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_half_sm2.S @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_sm2) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define d4 x6 +#define m x7 +#define n x8 + + +S2N_BN_SYMBOL(bignum_half_sm2): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Get a bitmask corresponding to the lowest bit of the input + + and m, d0, #1 + neg m, m + +// Do a masked addition of p_sm2, catching carry in a 5th word + + adds d0, d0, m + and n, m, #0xffffffff00000000 + adcs d1, d1, n + adcs d2, d2, m + and n, m, #0xfffffffeffffffff + adcs d3, d3, n + adc d4, xzr, xzr + +// Now shift that sum right one place + + extr d0, d1, d0, #1 + extr d1, d2, d1, #1 + extr d2, d3, d2, #1 + extr d3, d4, d3, #1 + +// Store back + + stp d0, d1, [z] + stp d2, d3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_inv_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_inv_sm2.S new file mode 100644 index 00000000000..c28197c2da8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_inv_sm2.S @@ -0,0 +1,1270 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_inv_sm2(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_sm2, i.e. is not divisible +// by it, returns z < p_sm2 such that x * z == 1 (mod p_sm2). Note that +// x does not need to be reduced modulo p_sm2, but the output always is. +// If the input is divisible (i.e. is 0 or p_sm2), then there can be no +// modular inverse and z = 0 is returned. 
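The inverse is computed with signed divsteps, applied in batches of 59 at a time to low-order windows of f and g by the divstep59 macro further down. For orientation, a single divstep in the same d = 2*delta convention can be sketched on plain signed words as follows; this is a textbook sketch that assumes arithmetic right shift on signed integers and ignores the multi-word state and window packing.

#include <stdint.h>

// One signed divstep: f stays odd throughout; when d > 0 and g is odd the
// roles swap and g := (g - f)/2, otherwise g := (g + (g mod 2)*f)/2.
// In both cases d is updated so that it remains 2*delta.
static void divstep(int64_t *d, int64_t *f, int64_t *g) {
  if (*d > 0 && (*g & 1)) {
    int64_t old_f = *f;
    *d = 2 - *d;
    *f = *g;
    *g = (*g - old_f) >> 1;                  // f and g both odd, so exact halving
  } else {
    *d = 2 + *d;
    *g = (*g + ((*g & 1) ? *f : 0)) >> 1;    // force g even, then halve
  }
}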
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_sm2) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(6*N) +#define u sp, #(12*N) +#define v sp, #(16*N) + +// Total size to reserve on the stack + +#define NSPACE #(20*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally +// as well as t0, t1, t2, t3. This is almost-Montgomery, i.e. the result +// fits in 4 digits but is not necessarily strictly reduced mod p_sm2. +// --------------------------------------------------------------------------- + +#define amontred(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_sm2. */ \ + mov t0, #0xe000000000000000 __LF \ + adds d0, d0, t0 __LF \ + mov t1, #0x1fffffffffffffff __LF \ + adcs d1, d1, t1 __LF \ + mov t2, #0xffffffffe0000000 __LF \ + adcs d2, d2, t2 __LF \ + sbcs d3, d3, xzr __LF \ + and t0, t1, #0xffffffffdfffffff __LF \ + adc d4, d4, t0 __LF \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc t0, d0, t3 __LF \ + adds d4, d4, t0 __LF \ +/* Now capture top carry and subtract p_sm2 if set (almost-Montgomery) */ \ + csetm t0, cs __LF \ + subs d1, d1, t0 __LF \ + and t1, t0, #0xffffffff00000000 __LF \ + sbcs d2, d2, t1 __LF \ + and t2, t0, #0xfffffffeffffffff __LF \ + sbcs d3, d3, t0 __LF \ + sbc d4, d4, t2 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, 
x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr 
x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, 
x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, 
#0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_inv_sm2): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff00000000 + mov x13, #0xfffffffeffffffff + stp x10, x11, [f] + stp x10, x13, [f+2*N] + str xzr, [f+4*N] + + ldp x2, x3, [x1] + subs x10, x2, #-1 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + adcs x12, x4, xzr + sbcs x13, x5, x13 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + str xzr, [g+4*N] + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-50} * [u,v] (mod p_sm2) +// starting with [p_sm2,x] == x * 2^{5*0-50} * [0,2^50] (mod p_sm2) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + + mov x10, #0x0004000000000000 + stp x10, xzr, [v] + stp xzr, xzr, [v+2*N] + +// Start of main loop. 
We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b bignum_inv_sm2_midloop + +bignum_inv_sm2_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. +// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + ldr x23, [f+4*N] + eor x3, x23, s00 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + ldr x24, [g+4*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + asr x3, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + eor x5, x23, s10 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + asr x5, x5, #59 + str x5, [g+4*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. 
A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldr x6, [u+2*N] + amontred(x3,x5,x6,x1,x0, x24,x10,x11,x14) + stp x1, x6, [u] + stp x5, x3, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldr x3, [v+2*N] + amontred(x5,x2,x3,x1,x0, x24,x10,x11,x14) + stp x1, x3, [v] + stp x2, x5, [v+16] + +bignum_inv_sm2_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_inv_sm2_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_sm2) +// we want to flip the sign of u according to that of f. 
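In C terms that sign extraction is just a multiply-accumulate of the low digits followed by an arithmetic shift into a mask; a small sketch (the helper name and scalar arguments are illustrative, the real code works on the stored digits and the matrix registers):

#include <stdint.h>

// Returns 0 if the updated f is +1 and all-ones if it is -1, given the low
// digits f0, g0 and the signed matrix entries m00, m01. The low 64 bits of
// f0*m00 + g0*m01 suffice because |f| = 1 at this point.
static uint64_t sign_mask_of_f(uint64_t f0, uint64_t g0,
                               int64_t m00, int64_t m01) {
  uint64_t low = f0 * (uint64_t)m00 + g0 * (uint64_t)m01;
  return (uint64_t)((int64_t)low >> 63);     // arithmetic shift to a mask
}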
+ + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u. This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_sm2 + + ldp x0, x1, [u] + ldr x2, [u+2*N] + amontred(x3,x5,x2,x1,x0, x24,x10,x11,x14) + + mov x10, #0xffffffffffffffff + subs x10, x1, #-1 + mov x11, #0xffffffff00000000 + sbcs x11, x2, x11 + mov x13, #0xfffffffeffffffff + adcs x12, x5, xzr + sbcs x13, x3, x13 + + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + +// Store it back to the final output + + stp x10, x11, [res] + stp x12, x13, [res, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2.S new file mode 100644 index 00000000000..f81048a14a3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2.S @@ -0,0 +1,175 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_sm2 +// Input x[k]; output z[4] +// +// extern void bignum_mod_nsm2 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the GM/T 0003-2012 curve SM2. 
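Each pass of the reduction loop below folds one more digit into the 4-word state using the quotient approximation quoted in the loop comment, q = MIN((m3*(1 + 2^32 + 2^64) + m2 + 2^64) >> 64, 2^64 - 1); the same estimate appears in bignum_mod_sm2 further below. A direct C rendering of that formula, as a sketch assuming unsigned __int128 (the assembly realises it with flag tricks rather than a 128-bit type, and the helper name is hypothetical):

#include <stdint.h>

// Quotient estimate for one 5 -> 4 digit reduction step, computed from the
// top two digits m3 and m2 of the current value.
static uint64_t quotient_estimate(uint64_t m3, uint64_t m2) {
  unsigned __int128 s = (unsigned __int128)m3 * ((1ULL << 32) + 1)    // m3*(1 + 2^32)
                      + m2 + ((unsigned __int128)1 << 64);            // + m2 + 2^64
  unsigned __int128 q = (unsigned __int128)m3 + (uint64_t)(s >> 64);  // add the m3*2^64 term
  return q > 0xffffffffffffffffULL ? 0xffffffffffffffffULL : (uint64_t)q;
}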
+// +// Standard ARM ABI: X0 = z, X1 = k, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2_alt) + .text + .balign 4 + +#define z x0 +#define k x1 +#define x x2 + +#define m0 x3 +#define m1 x4 +#define m2 x5 +#define m3 x6 + +#define t0 x7 +#define t1 x8 +#define t2 x9 +#define t3 x10 +#define t4 x11 + +#define n0 x12 +#define n1 x13 +#define n3 x14 + +// These two are aliased: we only load d when finished with q + +#define q x15 +#define d x15 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_nsm2): + +S2N_BN_SYMBOL(bignum_mod_nsm2_alt): + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmp k, #4 + bcc bignum_mod_nsm2_short + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + sub k, k, #4 + lsl t0, k, #3 + add t0, t0, x + ldp m2, m3, [t0, #16] + ldp m0, m1, [t0] + +// Load the complicated three words of 2^256 - n_sm2 = [n3; 0; n1; n0] + + movbig(n0, #0xac44, #0x0bf6, #0xc62a, #0xbedd) + movbig(n1, #0x8dfc, #0x2094, #0xde39, #0xfad4) + mov n3, 0x0000000100000000 + +// Reduce the top 4 digits mod n_sm2 (a conditional subtraction of n_sm2) + + adds t0, m0, n0 + adcs t1, m1, n1 + adcs t2, m2, xzr + adcs t3, m3, n3 + csel m0, m0, t0, cc + csel m1, m1, t1, cc + csel m2, m2, t2, cc + csel m3, m3, t3, cc + +// Now do (k-4) iterations of 5->4 word modular reduction + + cbz k, bignum_mod_nsm2_writeback +bignum_mod_nsm2_loop: + +// Writing the input, with the new zeroth digit implicitly appended, as +// z = 2^256 * m3 + 2^192 * m2 + t, our intended quotient approximation is +// MIN ((m3 * (1 + 2^32 + 2^64) + m2 + 2^64) >> 64) (2^64 - 1) + + adds t0, m2, m3 + mov t2, #1 + adc t1, m3, t2 + add t2, m3, t0, lsr #32 + adds q, t1, t2, lsr #32 + cinv q, q, cs + +// [t4;t3;t2;t1;t0] = q * (2^256 - n_sm2) + + mul t0, n0, q + mul t1, n1, q + mul t3, n3, q + umulh t2, n0, q + adds t1, t1, t2 + umulh t2, n1, q + adc t2, t2, xzr // No carry: high of mul + {0,1} + umulh t4, n3, q + +// Compensate for 2^256 * q + + sub m3, m3, q + +// Decrement k and load the next digit (note that d aliases to q) + + sub k, k, #1 + ldr d, [x, k, lsl #3] + +// [t4;t3;t2;t1;t0] = [m3;m2;m1;m0;d] - q * n_sm2 + + adds t0, d, t0 + adcs t1, m0, t1 + adcs t2, m1, t2 + adcs t3, m2, t3 + adc t4, m3, t4 + +// Now our top word t4 is either zero or all 1s. 
Use it for a masked +// addition of n_sm2, which we can do by a *subtraction* of +// 2^256 - n_sm2 from our portion, re-using the constants + + and d, t4, n0 + subs m0, t0, d + and d, t4, n1 + sbcs m1, t1, d + sbcs m2, t2, xzr + and d, t4, n3 + sbc m3, t3, d + + cbnz k, bignum_mod_nsm2_loop + +// Finally write back [m3;m2;m1;m0] and return + +bignum_mod_nsm2_writeback: + stp m0, m1, [z] + stp m2, m3, [z, #16] + ret + +// Short case: just copy the input with zero-padding + +bignum_mod_nsm2_short: + mov m0, xzr + mov m1, xzr + mov m2, xzr + mov m3, xzr + + cbz k, bignum_mod_nsm2_writeback + ldr m0, [x] + subs k, k, #1 + beq bignum_mod_nsm2_writeback + ldr m1, [x, #8] + subs k, k, #1 + beq bignum_mod_nsm2_writeback + ldr m2, [x, #16] + b bignum_mod_nsm2_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2_4.S new file mode 100644 index 00000000000..dd1bc66bea2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_nsm2_4.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_mod_nsm2_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the GM/T 0003-2012 curve SM2. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_mod_nsm2_4): + +// Load the complicated three words of n_sm2, the other being all 1s + + movbig( n0, #0x53BB, #0xF409, #0x39D5, #0x4123) + movbig( n1, #0x7203, #0xDF6B, #0x21C6, #0x052B) + mov n3, #0xFFFFFFFEFFFFFFFF + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. Since word 2 of n_sm2 is all 1s, that can be +// done by adding zero with carry, thanks to the inverted carry. + + subs n0, d0, n0 + sbcs n1, d1, n1 + adcs n2, d2, xzr + sbcs n3, d3, n3 + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2.S new file mode 100644 index 00000000000..e847008b1b2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2.S @@ -0,0 +1,150 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[k]; output z[4] +// +// extern void bignum_mod_sm2 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Standard ARM ABI: X0 = z, X1 = k, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_sm2) + .text + .balign 4 + +#define z x0 +#define k x1 +#define x x2 + +#define m0 x3 +#define m1 x4 +#define m2 x5 +#define m3 x6 + +#define t0 x7 +#define t1 x8 +#define t2 x9 +#define t3 x10 +#define t4 x11 + +#define n1 x12 +#define n3 x13 + +#define q x14 + + +S2N_BN_SYMBOL(bignum_mod_sm2): + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmp k, #4 + bcc bignum_mod_sm2_short + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + sub k, k, #4 + lsl t0, k, #3 + add t0, t0, x + ldp m2, m3, [t0, #16] + ldp m0, m1, [t0] + +// Load the complicated words of p_sm2 = [n3;-1;n1;-1] + + mov n1, #0xffffffff00000000 + mov n3, #0xfffffffeffffffff + +// Reduce the top 4 digits mod p_sm2 (a conditional subtraction of p_sm2) + + subs t0, m0, #-1 + sbcs t1, m1, n1 + adcs t2, m2, xzr + sbcs t3, m3, n3 + + csel m0, m0, t0, cc + csel m1, m1, t1, cc + csel m2, m2, t2, cc + csel m3, m3, t3, cc + +// Now do (k-4) iterations of 5->4 word modular reduction + + cbz k, bignum_mod_sm2_writeback +bignum_mod_sm2_loop: + +// Decrement k and load the next digit as t0. We then want to reduce +// [m3;m2;m1;m0;t0] |-> [m3;m2;m1;m0]; the shuffling downwards is absorbed +// into the various ALU operations + + sub k, k, #1 + ldr t0, [x, k, lsl #3] + +// Writing the input, with the new zeroth digit t0 appended, as +// z = 2^256 * m3 + 2^192 * m2 + t, our intended quotient approximation is +// MIN ((m3 * (1 + 2^32 + 2^64) + m2 + 2^64) >> 64) (2^64 - 1) + + adds t3, m2, m3 + mov t2, #1 + adc t1, m3, t2 + add t2, m3, t3, lsr #32 + adds q, t1, t2, lsr #32 + cinv q, q, cs + +// Let t3 = q<<32 and t4 = q>>32 then [t2;t1] = 2^32 * q - q + + lsl t3, q, #32 + subs t1, t3, q + lsr t4, q, #32 + sbc t2, t4, xzr + +// Do the basic correction to get [t4;t2;t2;t1;t0] = [m3;m2;m1;m0;t0] - q * p + + adds t0, t0, q + adcs t1, t1, m0 + sub m3, m3, q + adcs t2, t2, m1 + adcs t3, t3, m2 + adc t4, t4, m3 + +// Use top word as mask to correct + + adds m0, t0, t4 + and t0, n1, t4 + adcs m1, t1, t0 + adcs m2, t2, t4 + and t0, n3, t4 + adc m3, t3, t0 + + cbnz k, bignum_mod_sm2_loop + +// Finally write back [m3;m2;m1;m0] and return + +bignum_mod_sm2_writeback: + stp m0, m1, [z] + stp m2, m3, [z, #16] + ret + +// Short case: just copy the input with zero-padding + +bignum_mod_sm2_short: + mov m0, xzr + mov m1, xzr + mov m2, xzr + mov m3, xzr + + cbz k, bignum_mod_sm2_writeback + ldr m0, [x] + subs k, k, #1 + beq bignum_mod_sm2_writeback + ldr m1, [x, #8] + subs k, k, #1 + beq bignum_mod_sm2_writeback + ldr m2, [x, #16] + b bignum_mod_sm2_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2_4.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2_4.S new file mode 100644 index 00000000000..4654f667989 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_mod_sm2_4.S @@ -0,0 +1,70 @@ 
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_mod_sm2_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_sm2_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_sm2_4) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define n0 x2 +#define n1 x3 +#define n2 x4 +#define n3 x5 + +#define d0 x6 +#define d1 x7 +#define d2 x8 +#define d3 x9 + + +S2N_BN_SYMBOL(bignum_mod_sm2_4): + +// Load the non-trivial words of p_sm2 = [n3;-1;n2;-1] + + mov n1, #0xffffffff00000000 + mov n3, #0xfffffffeffffffff + +// Load the input number + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Do the subtraction. + + subs n0, d0, #-1 + sbcs n1, d1, n1 + adcs n2, d2, xzr + sbcs n3, d3, n3 + +// Now if the carry is *clear* (inversion at work) the subtraction carried +// and hence we should have done nothing, so we reset each n_i = d_i + + csel n0, d0, n0, cc + csel n1, d1, n1, cc + csel n2, d2, n2, cc + csel n3, d3, n3, cc + +// Store the end result + + stp n0, n1, [z] + stp n2, n3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montinv_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montinv_sm2.S new file mode 100644 index 00000000000..fbcb136911a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montinv_sm2.S @@ -0,0 +1,1290 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_montinv_sm2(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_sm2, i.e. is not divisible +// by it, returns z < p_sm2 such that x * z == 2^512 (mod p_sm2). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^256 * X and z == 2^256 * Z +// (both mod p_sm2) then X * Z == 1 (mod p_sm2). That is, this function +// gives the analog of the modular inverse bignum_inv_sm2 but with both +// input and output in the Montgomery domain. Note that x does not need +// to be reduced modulo p_sm2, but the output always is. If the input +// is divisible (i.e. is 0 or p_sm2), then there can be no solution to +// the congruence x * z == 2^512 (mod p_sm2), and z = 0 is returned. 
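For reference, the single conditional subtraction performed by bignum_mod_sm2_4 above (x < 2^256 < 2*p_sm2, so one subtract-and-select suffices), and the final corrections several of these routines end with, can be modelled by the following C sketch; it assumes unsigned __int128 and is not the library's implementation.

#include <stdint.h>

// Subtract p_sm2 with borrow propagation and keep the difference only when
// no borrow came out, selecting in constant time via a mask.
static void mod_sm2_4_sketch(uint64_t z[4], const uint64_t x[4]) {
  static const uint64_t P_SM2[4] = {
    0xffffffffffffffffULL, 0xffffffff00000000ULL,
    0xffffffffffffffffULL, 0xfffffffeffffffffULL
  };
  uint64_t diff[4], borrow = 0;
  for (int i = 0; i < 4; i++) {
    unsigned __int128 t = (unsigned __int128)x[i] - P_SM2[i] - borrow;
    diff[i] = (uint64_t)t;
    borrow = (uint64_t)(t >> 64) & 1;          // 1 if this limb borrowed
  }
  uint64_t mask = (uint64_t)0 - borrow;        // all-ones iff x < p_sm2
  for (int i = 0; i < 4; i++)
    z[i] = (x[i] & mask) | (diff[i] & ~mask);
}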
+ +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_sm2) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(6*N) +#define u sp, #(12*N) +#define v sp, #(16*N) + +// Total size to reserve on the stack + +#define NSPACE #(20*N) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally +// as well as t0, t1, t2, t3. This is almost-Montgomery, i.e. the result +// fits in 4 digits but is not necessarily strictly reduced mod p_sm2. +// --------------------------------------------------------------------------- + +#define amontred(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_sm2. */ \ + mov t0, #0xe000000000000000 __LF \ + adds d0, d0, t0 __LF \ + mov t1, #0x1fffffffffffffff __LF \ + adcs d1, d1, t1 __LF \ + mov t2, #0xffffffffe0000000 __LF \ + adcs d2, d2, t2 __LF \ + sbcs d3, d3, xzr __LF \ + and t0, t1, #0xffffffffdfffffff __LF \ + adc d4, d4, t0 __LF \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc t0, d0, t3 __LF \ + adds d4, d4, t0 __LF \ +/* Now capture top carry and subtract p_sm2 if set (almost-Montgomery) */ \ + csetm t0, cs __LF \ + subs d1, d1, t0 __LF \ + and t1, t0, #0xffffffff00000000 __LF \ + sbcs d2, d2, t1 __LF \ + and t2, t0, #0xfffffffeffffffff __LF \ + sbcs d3, d3, t0 __LF \ + sbc d4, d4, t2 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, 
x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x8, x4, #0x100, lsl #12 __LF \ + sbfx x8, x8, #21, #21 __LF \ + mov x11, #0x100000 __LF \ + add x11, x11, x11, lsl #21 __LF \ + add x9, x4, x11 __LF \ + asr x9, x9, #42 __LF \ + add x10, x5, #0x100, lsl #12 __LF \ + sbfx x10, x10, #21, #21 __LF \ + add x11, x5, x11 __LF \ + asr x11, x11, #42 __LF \ + mul x6, x8, x2 __LF \ + mul x7, x9, x3 __LF \ + mul x2, x10, x2 __LF \ + mul x3, x11, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr 
x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #21, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #42 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, 
x14, #21, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #42 __LF \ + mul x6, x12, x2 __LF \ + mul x7, x13, x3 __LF \ + mul x2, x14, x2 __LF \ + mul x3, x15, x3 __LF \ + add x4, x6, x7 __LF \ + add x5, x2, x3 __LF \ + asr x2, x4, #20 __LF \ + asr x3, x5, #20 __LF \ + and x4, x2, #0xfffff __LF \ + orr x4, x4, #0xfffffe0000000000 __LF \ + and x5, x3, #0xfffff __LF \ + orr x5, x5, #0xc000000000000000 __LF \ + tst x5, #0x1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + mul x2, x12, x8 __LF \ + mul x3, x12, x9 __LF \ + mul x6, x14, x8 __LF \ + mul x7, x14, x9 __LF \ + madd x8, x13, x10, x2 __LF \ + madd x9, x13, x11, x3 __LF \ + madd x16, x15, x10, x6 __LF \ + madd x17, x15, x11, x7 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, 
#0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + tst x5, #0x2 __LF \ + asr x5, x5, #1 __LF \ + csel x6, x4, xzr, ne __LF \ + ccmp x1, xzr, #0x8, ne __LF \ + cneg x1, x1, ge __LF \ + cneg x6, x6, ge __LF \ + csel x4, x5, x4, ge __LF \ + add x5, x5, x6 __LF \ + add x1, x1, #0x2 __LF \ + asr x5, x5, #1 __LF \ + add x12, x4, #0x100, lsl #12 __LF \ + sbfx x12, x12, #22, #21 __LF \ + mov x15, #0x100000 __LF \ + add x15, x15, x15, lsl #21 __LF \ + add x13, x4, x15 __LF \ + asr x13, x13, #43 __LF \ + add x14, x5, #0x100, lsl #12 __LF \ + sbfx x14, x14, #22, #21 __LF \ + add x15, x5, x15 __LF \ + asr x15, x15, #43 __LF \ + mneg x2, x12, x8 __LF \ + mneg x3, x12, x9 __LF \ + mneg x4, x14, x8 __LF \ + mneg x5, x14, x9 __LF \ + msub m00, x13, x16, x2 __LF \ + msub m01, x13, x17, x3 __LF \ + msub m10, x15, x16, x4 __LF \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_montinv_sm2): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0xffffffffffffffff + mov x11, #0xffffffff00000000 + mov x13, #0xfffffffeffffffff + stp x10, x11, [f] + stp x10, x13, [f+2*N] + str xzr, [f+4*N] + + ldp x2, x3, [x1] + subs x10, x2, #-1 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + adcs x12, x4, xzr + sbcs x13, x5, x13 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + str xzr, [g+4*N] + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-50} * [u,v] (mod p_sm2) +// starting with [p_sm2,x] == x * 2^{5*0-562} * [0,2^562] (mod p_sm2) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 10th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{50-562} * u == 1, i.e. +// x * u == 2^512 as required. 
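
To make the invariant above concrete, here is a small Python sketch of the end-to-end contract it implies (an illustrative model only, not part of the patch; the helper name montinv_sm2_model is made up, and pow with a negative exponent or modular inverse needs Python 3.8+): for nonzero x < p_sm2 the routine returns u with x * u == 2^512 (mod p_sm2), i.e. it maps a Montgomery-form input to the Montgomery-form inverse.

# Illustrative model of the bignum_montinv_sm2 contract described above
# (not the divstep implementation itself).
P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF

def montinv_sm2_model(x):
    assert 0 < x < P_SM2
    u = pow(x, -1, P_SM2) * pow(2, 512, P_SM2) % P_SM2
    assert x * u % P_SM2 == pow(2, 512, P_SM2)   # the stated relation after the 10th iteration
    return u

# In Montgomery terms: if x = 2^256 * a (mod p_sm2), then u = 2^256 * a^-1 (mod p_sm2).
a = 0x123456789ABCDEF
x = (a << 256) % P_SM2
assert montinv_sm2_model(x) == (pow(a, -1, P_SM2) << 256) % P_SM2
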
+ + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + +// The starting constant 2^562 mod p_sm2 is +// 0x0018000000040000:0x00040000000c0000:0x000bfffffff80000:0x000c000000100000 +// where colons separate 64-bit subwords, least significant at the right. +// These each need a couple of instructions to create on ARM + + mov x10, #0x0000000000100000 + orr x10, x10, #0x000c000000000000 + mov x11, #0x000c000000000000 + sub x11, x11, #0x80000 + stp x10, x11, [v] + mov x12, #0x0004000000000000 + orr x12, x12, #0x00000000000c0000 + mov x13, #0x0018000000000000 + orr x13, x13, #0x0000000000040000 + stp x12, x13, [v+2*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b bignum_montinv_sm2_midloop + +bignum_montinv_sm2_loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. +// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + ldr x23, [f+4*N] + eor x3, x23, s00 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + ldr x24, [g+4*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, 
x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + asr x3, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + eor x5, x23, s10 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + asr x5, x5, #59 + str x5, [g+4*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. +// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldr x6, [u+2*N] + amontred(x3,x5,x6,x1,x0, x24,x10,x11,x14) + stp x1, x6, [u] + stp x5, x3, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldr x3, [v+2*N] + amontred(x5,x2,x3,x1,x0, x24,x10,x11,x14) + stp x1, x3, [v] + stp x2, x5, [v+16] + +bignum_montinv_sm2_midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne bignum_montinv_sm2_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. 
+// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-512} [u,v] (mod p_sm2) +// we want to flip the sign of u according to that of f. + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u. This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_sm2 + + ldp x0, x1, [u] + ldr x2, [u+2*N] + amontred(x3,x5,x2,x1,x0, x24,x10,x11,x14) + + mov x10, #0xffffffffffffffff + subs x10, x1, #-1 + mov x11, #0xffffffff00000000 + sbcs x11, x2, x11 + mov x13, #0xfffffffeffffffff + adcs x12, x5, xzr + sbcs x13, x3, x13 + + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + +// Store it back to the final output + + stp x10, x11, [res] + stp x12, x13, [res, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2.S new file mode 100644 index 00000000000..f2595cb7ec5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2.S @@ -0,0 +1,267 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_sm2, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_sm2 (in particular this is true if we are in +// the "usual" case x < p_sm2 and y < p_sm2). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_sm2) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +#define muldiffn(c,h,l, t, x,y, w,z) \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ + eor h, h, c + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define s0 x11 +#define s1 x12 +#define s2 x13 +#define s3 x14 +#define t0 x15 +#define t1 x16 +#define t2 x17 +#define t3 x1 +#define s4 x2 + +S2N_BN_SYMBOL(bignum_montmul_sm2): + +// Load in all words of both inputs + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + ldp b0, b1, [x2] + ldp b2, b3, [x2, #16] + +// Multiply low halves with a 2x2->4 ADK multiplier as L = [s3;s2;s1;s0] + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a0,a1, b1,b0) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Perform two "short" Montgomery steps on the low product to +// get a modified low result L' = [s1;s0;s3;s2] +// This shifts it to an offset compatible with middle terms +// Stash the result L' temporarily in the output buffer to avoid +// using additional registers. 
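
As a side note on the arithmetic here (an illustrative Python sketch, not part of the patch; montreds_model is a made-up name, and pow with a negative exponent needs Python 3.8+): because p_sm2 == -1 (mod 2^64), the Montgomery multiplier in each "short" step is just the low limb itself, so one montreds step is the exact division of t + (t mod 2^64) * p_sm2 by 2^64, and the two steps here shift the low product L to L' == L * 2^-128 (mod p_sm2).

# One word-level Montgomery step mod p_sm2, as the montreds macro computes it.
P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF
M64 = (1 << 64) - 1

def montreds_model(t):
    t = t + (t & M64) * P_SM2    # multiplier is the low limb, since -p_sm2^-1 == 1 (mod 2^64)
    assert t & M64 == 0          # low limb is now zero ...
    return t >> 64               # ... so this shift is an exact division by 2^64

# Two steps give the offset used here: L' == L * 2^-128 (mod p_sm2).
L = 0x0123456789ABCDEF << 180
assert montreds_model(montreds_model(L)) % P_SM2 == L * pow(2, -128, P_SM2) % P_SM2
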
+ + montreds(s0,s3,s2,s1,s0, t0,t1,t2,t3) + montreds(s1,s0,s3,s2,s1, t0,t1,t2,t3) + + stp s2, s3, [x0] + stp s0, s1, [x0, #16] + +// Multiply high halves with a 2x2->4 ADK multiplier as H = [s3;s2;s1;s0] + + mul s0, a2, b2 + mul s2, a3, b3 + umulh s1, a2, b2 + adds t1, s0, s2 + umulh s3, a3, b3 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(t3,t2,t1, t0, a2,a3, b3,b2) + adds xzr, t3, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, t3 + +// Compute sign-magnitude a2,[a1,a0] = x_hi - x_lo + + subs a0, a2, a0 + sbcs a1, a3, a1 + sbc a2, xzr, xzr + adds xzr, a2, #1 + eor a0, a0, a2 + adcs a0, a0, xzr + eor a1, a1, a2 + adcs a1, a1, xzr + +// Compute sign-magnitude b2,[b1,b0] = y_lo - y_hi + + subs b0, b0, b2 + sbcs b1, b1, b3 + sbc b2, xzr, xzr + adds xzr, b2, #1 + eor b0, b0, b2 + adcs b0, b0, xzr + eor b1, b1, b2 + adcs b1, b1, xzr + +// Save the correct sign for the sub-product in b3 + + eor b3, a2, b2 + +// Add the high H to the modified low term L' as H + L' = [s4;b2;a2;t3;t0] + + ldp t0, t3, [x0] + adds t0, s0, t0 + adcs t3, s1, t3 + ldp a2, b2, [x0, #16] + adcs a2, s2, a2 + adcs b2, s3, b2 + adc s4, xzr, xzr + +// Multiply with yet a third 2x2->4 ADK multiplier for complex mid-term M + + mul s0, a0, b0 + mul s2, a1, b1 + umulh s1, a0, b0 + adds t1, s0, s2 + umulh s3, a1, b1 + adcs t2, s1, s3 + adcs s3, s3, xzr + adds s1, s1, t1 + adcs s2, s2, t2 + adcs s3, s3, xzr + muldiffn(a1,t2,t1, a0, a0,a1, b1,b0) + adds xzr, a1, #1 + adcs s1, s1, t1 + adcs s2, s2, t2 + adc s3, s3, a1 + +// Set up a sign-modified version of the mid-product in a long accumulator +// as [b3;a1;a0;s3;s2;s1;s0], adding in the H + L' term once with +// zero offset as this signed value is created + + adds xzr, b3, #1 + eor s0, s0, b3 + adcs s0, s0, t0 + eor s1, s1, b3 + adcs s1, s1, t3 + eor s2, s2, b3 + adcs s2, s2, a2 + eor s3, s3, b3 + adcs s3, s3, b2 + adcs a0, s4, b3 + adcs a1, b3, xzr + adc b3, b3, xzr + +// Add in the stashed H + L' term an offset of 2 words as well + + adds s2, s2, t0 + adcs s3, s3, t3 + adcs a0, a0, a2 + adcs a1, a1, b2 + adc b3, b3, s4 + +// Do two more Montgomery steps on the composed term +// Net pre-reduct is in [b3;a1;a0;s3;s2] + + montreds(s0,s3,s2,s1,s0, t0,t1,t2,t3) + montreds(s1,s0,s3,s2,s1, t0,t1,t2,t3) + + adds a0, a0, s0 + adcs a1, a1, s1 + adc b3, b3, xzr + +// Because of the way we added L' in two places, we can overspill by +// more than usual in Montgomery, with the result being only known to +// be < 3 * p_sm2, not the usual < 2 * p_sm2. So now we do a more +// elaborate final correction in the style of bignum_cmul_sm2, though +// we can use much simpler quotient estimation logic (q = h + 1) and +// slightly more direct accumulation of p_sm2 * q. 
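
For intuition, the correction described in the comment above can be sketched in a few lines of Python (illustrative only; final_correct_model is a hypothetical name and the bound z < 3 * p_sm2 is the one quoted in the comment): with h = floor(z / 2^256), the estimate q = h + 1 gives -p_sm2 <= z - q * p_sm2 < p_sm2, so at most one masked add-back of p_sm2 is needed. The same q = h + 1 estimate reappears in bignum_triple_sm2 later in this patch.

# Sketch of the q = h + 1 final correction for a value known to be < 3 * p_sm2.
P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF

def final_correct_model(z):
    assert 0 <= z < 3 * P_SM2
    q = (z >> 256) + 1        # h + 1, where h is the top (overflow) word
    z -= q * P_SM2            # now -p_sm2 <= z < p_sm2
    if z < 0:                 # the assembly does this with a carry-derived mask
        z += P_SM2
    return z

for z in (0, P_SM2 - 1, P_SM2, 2 * P_SM2 + 5, 3 * P_SM2 - 1):
    assert final_correct_model(z) == z % P_SM2
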
+ +#define d0 s2 +#define d1 s3 +#define d2 a0 +#define d3 a1 +#define h b3 + +#define q s4 +#define c b0 + + add q, h, #1 + lsl t0, q, #32 + sub t1, t0, q + adds d0, d0, q + adcs d1, d1, t1 + adcs d2, d2, xzr + adcs d3, d3, t0 + csetm c, cc + adds d0, d0, c + and t1, c, #0xffffffff00000000 + adcs d1, d1, t1 + adcs d2, d2, c + and t0, c, #0xfffffffeffffffff + adc d3, d3, t0 + +// Finally store the result + + stp d0, d1, [x0] + stp d2, d3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2_alt.S new file mode 100644 index 00000000000..a57f6c140d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montmul_sm2_alt.S @@ -0,0 +1,204 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_sm2_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_sm2, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_sm2 (in particular this is true if we are in +// the "usual" case x < p_sm2 and y < p_sm2). +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_sm2_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. 
+// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +#define z x0 +#define x x1 +#define y x2 + +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 +#define b0 x7 +#define b1 x8 +#define b2 x9 +#define b3 x10 + +#define l x11 + +#define u0 x12 +#define u1 x13 +#define u2 x14 +#define u3 x15 +#define u4 x16 + +// These alias to the input arguments when no longer needed + +#define u5 a0 +#define u6 a1 +#define u7 a2 +#define h a3 + +S2N_BN_SYMBOL(bignum_montmul_sm2_alt): + +// Load operands and set up row 0 = [u4;...;u0] = a0 * [b3;...;b0] + + ldp a0, a1, [x] + ldp b0, b1, [y] + + mul u0, a0, b0 + umulh u1, a0, b0 + mul l, a0, b1 + umulh u2, a0, b1 + adds u1, u1, l + + ldp b2, b3, [y, #16] + + mul l, a0, b2 + umulh u3, a0, b2 + adcs u2, u2, l + + mul l, a0, b3 + umulh u4, a0, b3 + adcs u3, u3, l + adc u4, u4, xzr + + ldp a2, a3, [x, #16] + +// Row 1 = [u5;...;u0] = [a1;a0] * [b3;...;b0] + + mul l, a1, b0 + adds u1, u1, l + mul l, a1, b1 + adcs u2, u2, l + mul l, a1, b2 + adcs u3, u3, l + mul l, a1, b3 + adcs u4, u4, l + umulh u5, a1, b3 + adc u5, u5, xzr + + umulh l, a1, b0 + adds u2, u2, l + umulh l, a1, b1 + adcs u3, u3, l + umulh l, a1, b2 + adcs u4, u4, l + adc u5, u5, xzr + +// Row 2 = [u6;...;u0] = [a2;a1;a0] * [b3;...;b0] + + mul l, a2, b0 + adds u2, u2, l + mul l, a2, b1 + adcs u3, u3, l + mul l, a2, b2 + adcs u4, u4, l + mul l, a2, b3 + adcs u5, u5, l + umulh u6, a2, b3 + adc u6, u6, xzr + + umulh l, a2, b0 + adds u3, u3, l + umulh l, a2, b1 + adcs u4, u4, l + umulh l, a2, b2 + adcs u5, u5, l + adc u6, u6, xzr + +// Row 3 = [u7;...;u0] = [a3;...a0] * [b3;...;b0] + + mul l, a3, b0 + adds u3, u3, l + mul l, a3, b1 + adcs u4, u4, l + mul l, a3, b2 + adcs u5, u5, l + mul l, a3, b3 + adcs u6, u6, l + umulh u7, a3, b3 + adc u7, u7, xzr + + umulh l, a3, b0 + adds u4, u4, l + umulh l, a3, b1 + adcs u5, u5, l + umulh l, a3, b2 + adcs u6, u6, l + adc u7, u7, xzr + +// Perform 4 Montgomery steps to rotate the lower half + + montreds(u0,u3,u2,u1,u0, h,l,b0,b1) + montreds(u1,u0,u3,u2,u1, h,l,b0,b1) + montreds(u2,u1,u0,u3,u2, h,l,b0,b1) + montreds(u3,u2,u1,u0,u3, h,l,b0,b1) + +// Add high and low parts, catching carry in b1 + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + adcs u3, u3, u7 + cset b1, cs + +// Set [h;-1;l;-1] = p_sm2 and form [u7,u6,u5,u4] = [b1;u3;u2;u1;u0] - p_sm2 + + mov l, #0xffffffff00000000 + mov h, #0xfffffffeffffffff + + subs u4, u0, #-1 + sbcs u5, u1, l + adcs u6, u2, xzr + sbcs u7, u3, h + sbcs xzr, b1, xzr + +// Now CF is clear if the comparison carried so the original was fine +// Otherwise take the form with p_sm2 subtracted. 
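
Putting the pieces together, the overall flow of this routine can be modelled in Python as follows (an illustrative sketch of the mathematics rather than a limb-level transcription; montmul_sm2_alt_model is a made-up name, and pow with a negative exponent needs Python 3.8+): a full 512-bit product, four word-level Montgomery steps, then the single conditional subtraction selected by the carry as described above.

# Whole-function model: result == x * y * 2^-256 (mod p_sm2), assuming the
# stated precondition x * y <= 2^256 * p_sm2.
P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF
M64 = (1 << 64) - 1

def montmul_sm2_alt_model(x, y):
    t = x * y                              # 512-bit double-length product
    for _ in range(4):                     # one Montgomery step per 64-bit limb
        t = (t + (t & M64) * P_SM2) >> 64
    return t - P_SM2 if t >= P_SM2 else t  # t < 2 * p_sm2, so one subtraction suffices

x, y = 0x1234 << 200, 0xCAFEF00D
assert montmul_sm2_alt_model(x, y) == x * y * pow(2, -256, P_SM2) % P_SM2
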
+ + csel u0, u0, u4, cc + csel u1, u1, u5, cc + csel u2, u2, u6, cc + csel u3, u3, u7, cc + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2.S new file mode 100644 index 00000000000..3c715a176a1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2.S @@ -0,0 +1,268 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_sm2, assuming x^2 <= 2^256 * p_sm2, which is +// guaranteed in particular if x < p_sm2 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_sm2) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Macro returning (c,h,l) = 3-word 1s complement (x - y) * (w - z) +// c,h,l,t should all be different +// t,h should not overlap w,z +// --------------------------------------------------------------------------- + +#define muldiffn(c,h,l, t, x,y, w,z) \ + subs t, x, y __LF \ + cneg t, t, cc __LF \ + csetm c, cc __LF \ + subs h, w, z __LF \ + cneg h, h, cc __LF \ + mul l, t, h __LF \ + umulh h, t, h __LF \ + cinv c, c, cc __LF \ + eor l, l, c __LF \ + eor h, h, c + +// --------------------------------------------------------------------------- +// Core one-step "end" Montgomery reduction macro. Takes input in +// [d5;d4;d3;d2;d1;d0] and returns result in [d5;d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], re-using d0 as a temporary internally as well +// as t1, t2, t3, and initializing d5 from zero (hence "end"). +// --------------------------------------------------------------------------- + +#define montrede(d5, d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc t0, d0, t3 __LF \ + adds d4, d4, t0 __LF \ + adc d5, xzr, xzr + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. 
+// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define c0 x6 +#define c1 x7 +#define c2 x8 +#define c3 x9 +#define c4 x10 +#define d1 x11 +#define d2 x12 +#define d3 x13 +#define d4 x14 + +#define s0 x15 +#define s1 x16 +#define s2 x17 +#define s3 x1 + +#define a0short w2 +#define a1short w3 +#define d1short w11 + +S2N_BN_SYMBOL(bignum_montsqr_sm2): + +// Load in all words of the input + + ldp a0, a1, [x1] + ldp a2, a3, [x1, #16] + +// Square the low half, getting a result in [s3;s2;s1;s0] +// This uses 32x32->64 multiplications to reduce the number of UMULHs + + umull s0, a0short, a0short + lsr d1, a0, #32 + umull s1, d1short, d1short + umull d1, a0short, d1short + adds s0, s0, d1, lsl #33 + lsr d1, d1, #31 + adc s1, s1, d1 + umull s2, a1short, a1short + lsr d1, a1, #32 + umull s3, d1short, d1short + umull d1, a1short, d1short + mul d2, a0, a1 + umulh d3, a0, a1 + adds s2, s2, d1, lsl #33 + lsr d1, d1, #31 + adc s3, s3, d1 + adds d2, d2, d2 + adcs d3, d3, d3 + adc s3, s3, xzr + adds s1, s1, d2 + adcs s2, s2, d3 + adc s3, s3, xzr + +// Perform two "short" Montgomery steps on the low square +// This shifts it to an offset compatible with middle product + + montreds(s0,s3,s2,s1,s0, d1,d2,d3,d4) + + montreds(s1,s0,s3,s2,s1, d1,d2,d3,d4) + +// Compute cross-product with ADK 2x2->4 multiplier as [c3;c2;c1;c0] + + mul c0, a0, a2 + mul d4, a1, a3 + umulh c2, a0, a2 + muldiffn(d3,d2,d1, c4, a0,a1, a3,a2) + + adds c1, c0, c2 + adc c2, c2, xzr + + umulh c3, a1, a3 + + adds c1, c1, d4 + adcs c2, c2, c3 + adc c3, c3, xzr + adds c2, c2, d4 + adc c3, c3, xzr + + adds xzr, d3, #1 + adcs c1, c1, d1 + adcs c2, c2, d2 + adc c3, c3, d3 + +// Double it and add the Montgomerified low square + + adds c0, c0, c0 + adcs c1, c1, c1 + adcs c2, c2, c2 + adcs c3, c3, c3 + adc c4, xzr, xzr + + adds c0, c0, s2 + adcs c1, c1, s3 + adcs c2, c2, s0 + adcs c3, c3, s1 + adc c4, c4, xzr + +// Montgomery-reduce the combined low and middle term another twice + + montrede(c0,c4,c3,c2,c1,c0, d1,d2,d3,d4) + + montrede(c1,c0,c4,c3,c2,c1, d1,d2,d3,d4) + +// Our sum so far is in [c1,c0,c4,c3,c2]; choose more intuitive names + +#define r0 x8 +#define r1 x9 +#define r2 x10 +#define r3 x6 +#define c x7 + +// Remind ourselves what else we can't destroy + +#define a2 x4 +#define a3 x5 + +// So we can have these as temps + +#define t1 x11 +#define t2 x12 +#define t3 x13 + +// Add in the pure squares 22 + 33 + + mul t1, a2, a2 + adds r0, r0, t1 + mul t2, a3, a3 + umulh t1, a2, a2 + adcs r1, r1, t1 + adcs r2, r2, t2 + umulh t2, a3, a3 + adcs r3, r3, t2 + adc c, c, xzr + +// Construct the 23 term, double and add it in + + mul t1, a2, a3 + umulh t2, a2, a3 + adds t1, t1, t1 + adcs t2, t2, t2 + adc t3, xzr, xzr + + adds r1, r1, t1 + adcs r2, r2, t2 + adcs r3, r3, t3 + adcs c, c, xzr + +// We know, writing B = 2^{4*64} that the full implicit result is +// B^2 c <= z + (B - 1) * p < B * p + (B - 1) * p < 2 * B * p, +// so the top half is certainly < 2 * p. If c = 1 already, we know +// subtracting p will give the reduced modulus. 
But now we do a +// subtraction-comparison to catch cases where the residue is >= p. +// The constants are such that [t3;0;t1;-1] = p_256. + +#define t0 x5 + +// Set CF (because of inversion) iff (0,p_256) <= (c,r3,r2,r1,r0) + + mov t1, #0xffffffff00000000 + subs t0, r0, #-1 + sbcs t1, r1, t1 + mov t3, #0xfffffffeffffffff + adcs t2, r2, xzr + sbcs t3, r3, t3 + sbcs xzr, c, xzr + +// Select final output accordingly + + csel r0, t0, r0, cs + csel r1, t1, r1, cs + csel r2, t2, r2, cs + csel r3, t3, r3, cs + +// Store things back in place + + stp r0, r1, [x0] + stp r2, r3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2_alt.S new file mode 100644 index 00000000000..f2e871b3857 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_montsqr_sm2_alt.S @@ -0,0 +1,178 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_sm2_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_sm2, assuming x^2 <= 2^256 * p_sm2, which is +// guaranteed in particular if x < p_sm2 initially (the "intended" case). +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_sm2_alt) + .text + .balign 4 + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using t0, t1, t2 and t3 as +// temporaries. It is fine for d4 to be the same register as d0, +// and it often is. +// --------------------------------------------------------------------------- + +#define montreds(d4,d3,d2,d1,d0, t3,t2,t1,t0) \ +/* First let [t3;t2] = 2^32 * d0 and [t1;t0] = (2^32-1) * d0 */ \ + lsl t2, d0, #32 __LF \ + lsr t3, d0, #32 __LF \ + subs t0, t2, d0 __LF \ + sbc t1, t3, xzr __LF \ +/* Now [d4;d3;d2;d1] := [d0;d3;d2;d1] - [t3;t2;t1;t0] */ \ + subs d1, d1, t0 __LF \ + sbcs d2, d2, t1 __LF \ + sbcs d3, d3, t2 __LF \ + sbc d4, d0, t3 + +#define z x0 +#define x x1 + +#define a0 x2 +#define a1 x3 +#define a2 x4 +#define a3 x5 + +#define l x6 +#define h x7 + +#define u0 x8 +#define u1 x9 +#define u2 x10 +#define u3 x11 +#define u4 x12 +#define u5 x13 +#define u6 x14 + +// This one is the same as h, which is safe with this computation sequence + +#define u7 h + +S2N_BN_SYMBOL(bignum_montsqr_sm2_alt): + +// Load all the elements, set up an initial window [u6;...u1] = [23;03;01] +// and chain in the addition of 02 + 12 + 13 (no carry-out is possible). 
+// This gives all the "heterogeneous" terms of the squaring ready to double + + ldp a0, a1, [x] + + mul u1, a0, a1 + umulh u2, a0, a1 + + ldp a2, a3, [x, #16] + + mul u3, a0, a3 + umulh u4, a0, a3 + + mul l, a0, a2 + umulh h, a0, a2 + adds u2, u2, l + + adcs u3, u3, h + mul l, a1, a2 + umulh h, a1, a2 + adc h, h, xzr + adds u3, u3, l + + mul u5, a2, a3 + umulh u6, a2, a3 + + adcs u4, u4, h + mul l, a1, a3 + umulh h, a1, a3 + adc h, h, xzr + adds u4, u4, l + + adcs u5, u5, h + adc u6, u6, xzr + +// Now just double it; this simple approach seems to work better than extr + + adds u1, u1, u1 + adcs u2, u2, u2 + adcs u3, u3, u3 + adcs u4, u4, u4 + adcs u5, u5, u5 + adcs u6, u6, u6 + cset u7, cs + +// Add the homogeneous terms 00 + 11 + 22 + 33 + + umulh l, a0, a0 + mul u0, a0, a0 + adds u1, u1, l + + mul l, a1, a1 + adcs u2, u2, l + umulh l, a1, a1 + adcs u3, u3, l + + mul l, a2, a2 + adcs u4, u4, l + umulh l, a2, a2 + adcs u5, u5, l + + mul l, a3, a3 + adcs u6, u6, l + umulh l, a3, a3 + adc u7, u7, l + +// Squaring complete. Perform 4 Montgomery steps to rotate the lower half + + montreds(u0,u3,u2,u1,u0, a3,a2,a1,a0) + montreds(u1,u0,u3,u2,u1, a3,a2,a1,a0) + montreds(u2,u1,u0,u3,u2, a3,a2,a1,a0) + montreds(u3,u2,u1,u0,u3, a3,a2,a1,a0) + +// Add high and low parts, catching carry in a0 + + adds u0, u0, u4 + adcs u1, u1, u5 + adcs u2, u2, u6 + adcs u3, u3, u7 + cset a0, cs + +// Set [a3;-1;a1;-1] = p_sm2 and form [u7,u6,u5,u4] = [a0;u3;u2;u1;u0] - p_sm2 + + mov a1, #0xffffffff00000000 + mov a3, #0xfffffffeffffffff + + subs u4, u0, #-1 + sbcs u5, u1, a1 + adcs u6, u2, xzr + sbcs u7, u3, a3 + sbcs xzr, a0, xzr + +// Now CF is clear if the comparison carried so the original was fine +// Otherwise take the form with p_sm2 subtracted. + + csel u0, u0, u4, cc + csel u1, u1, u5, cc + csel u2, u2, u6, cc + csel u3, u3, u7, cc + +// Store back final result + + stp u0, u1, [z] + stp u2, u3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_neg_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_neg_sm2.S new file mode 100644 index 00000000000..e91f73ddfce --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_neg_sm2.S @@ -0,0 +1,66 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_sm2 (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_sm2) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define p x2 +#define t x3 + +#define d0 x4 +#define d1 x5 +#define d2 x6 +#define d3 x7 + +S2N_BN_SYMBOL(bignum_neg_sm2): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Set a bitmask p for the input being nonzero, so that we avoid doing +// -0 = p_sm2 and hence maintain strict modular reduction + + orr t, d0, d1 + orr p, d2, d3 + orr p, p, t + cmp p, #0 + csetm p, ne + +// Mask nontrivial words of p_sm2 = [n3;-1;n1;-1] and subtract + + subs d0, p, d0 + and t, p, #0xffffffff00000000 + sbcs d1, t, d1 + sbcs d2, p, d2 + and t, p, #0xfffffffeffffffff + sbc d3, t, d3 + +// Write back the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_optneg_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_optneg_sm2.S new file mode 100644 index 00000000000..5b977fda1f9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_optneg_sm2.S @@ -0,0 +1,83 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_sm2 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = p, X2 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_sm2) + .text + .balign 4 + +#define z x0 +#define p x1 +#define x x2 + +#define d0 x3 +#define d1 x4 +#define d2 x5 +#define d3 x6 +#define n0 x7 +#define n1 x8 +#define n2 x9 +#define n3 x10 + +S2N_BN_SYMBOL(bignum_optneg_sm2): + +// Load the 4 digits of x + + ldp d0, d1, [x] + ldp d2, d3, [x, #16] + +// Adjust p by zeroing it if the input is zero (to avoid giving -0 = p, which +// is not strictly reduced even though it's correct modulo p) + + orr n0, d0, d1 + orr n1, d2, d3 + orr n2, n0, n1 + cmp n2, #0 + csel p, xzr, p, eq + +// Load the nontrivial words of p_sm2 = [n3;-1;n1;-1] + + mov n2, #0xffffffffffffffff + mov n1, #0xffffffff00000000 + mov n3, #0xfffffffeffffffff + +// Do the subtraction, which by hypothesis does not underflow + + subs n0, n2, d0 + sbcs n1, n1, d1 + sbcs n2, n2, d2 + sbc n3, n3, d3 + +// Set condition code if original x is nonzero and p was nonzero + + cmp p, #0 + +// Hence multiplex and write back + + csel n0, n0, d0, ne + csel n1, n1, d1, ne + csel n2, n2, d2, ne + csel n3, n3, d3, ne + + stp n0, n1, [z] + stp n2, n3, [z, #16] + +// Return + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_sub_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_sub_sm2.S new file mode 100644 index 00000000000..38467a4fd05 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_sub_sm2.S @@ -0,0 +1,67 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_sm2, z := (x - y) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x, X2 = y +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_sm2) + .text + .balign 4 + +#define z x0 +#define x x1 +#define y x2 +#define c x3 +#define l x4 +#define d0 x5 +#define d1 x6 +#define d2 x7 +#define d3 x8 + + +S2N_BN_SYMBOL(bignum_sub_sm2): + +// First just subtract the numbers as [d3; d2; d1; d0] +// Set a mask based on (inverted) carry indicating x < y = correction is needed + + ldp d0, d1, [x] + ldp l, c, [y] + subs d0, d0, l + sbcs d1, d1, c + ldp d2, d3, [x, #16] + ldp l, c, [y, #16] + sbcs d2, d2, l + sbcs d3, d3, c + +// Create a mask for the condition x < y, when we need to correct + + csetm c, cc + +// Now correct by adding masked p_sm2 + + adds d0, d0, c + and l, c, #0xffffffff00000000 + adcs d1, d1, l + adcs d2, d2, c + and l, c, #0xfffffffeffffffff + adc d3, d3, l + +// Store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_tomont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_tomont_sm2.S new file mode 100644 index 00000000000..d5bfb407e68 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_tomont_sm2.S @@ -0,0 +1,108 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_sm2) + .text + .balign 4 + +// ---------------------------------------------------------------------------- +// Core "x |-> (2^64 * x) mod p_sm2" macro, with x assumed to be < p_sm2. +// We write it as a macro to be repeated instead of using .rep in assembler. +// The code here is very similar to the core of bignum_mod_sm2, just +// implicitly inserting zeros instead of fresh digits. 
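
A short Python sketch of what this file computes overall (illustrative only; tomont_sm2_model is a made-up name): Montgomery conversion is just four applications of the x |-> (2^64 * x) mod p_sm2 step that the macro below implements with a quotient approximation.

# Model of bignum_tomont_sm2: reduce below p_sm2 (the code uses a single
# conditional subtraction, which suffices for a 4-word input), then four
# word-sized shifts modulo p_sm2 give (2^256 * x) mod p_sm2.
P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF

def tomont_sm2_model(x):
    x %= P_SM2                     # initial reduction below p_sm2
    for _ in range(4):
        x = (x << 64) % P_SM2      # one modstep_sm2() per iteration
    return x

x = 0x1122334455667788990011223344556677
assert tomont_sm2_model(x) == (x << 256) % P_SM2
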
+// ---------------------------------------------------------------------------- + +#define modstep_sm2() \ +/* Writing the input, with a lowest zero digit appended, as */ \ +/* z = 2^256 * d3 + 2^192 * d2 + t, quotient approximation is */ \ +/* MIN ((d3 * (1 + 2^32 + 2^64) + d2 + 2^64) >> 64) (2^64 - 1) */ \ + adds t3, d2, d3 __LF \ + mov t2, #1 __LF \ + adc t1, d3, t2 __LF \ + add t2, d3, t3, lsr #32 __LF \ + adds q, t1, t2, lsr #32 __LF \ + cinv q, q, cs __LF \ +/* Let t3 = q<<32 and t4 = q>>32 then [t2;t1] = 2^32 * q - q */ \ + lsl t3, q, #32 __LF \ + subs t1, t3, q __LF \ + lsr t4, q, #32 __LF \ + sbc t2, t4, xzr __LF \ +/* Do the basic correction [t4;t3;t2;t1;q] = 2^256 * x - q * p */ \ + adds t1, t1, d0 __LF \ + sub d3, d3, q __LF \ + adcs t2, t2, d1 __LF \ + adcs t3, t3, d2 __LF \ + adc t4, t4, d3 __LF \ +/* Use top word as mask to correct */ \ + adds d0, q, t4 __LF \ + and t0, t4, #0xffffffff00000000 __LF \ + adcs d1, t1, t0 __LF \ + adcs d2, t2, t4 __LF \ + and t0, t4, #0xfffffffeffffffff __LF \ + adc d3, t3, t0 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 + +#define t1 x6 +#define t2 x7 +#define t3 x8 + +#define t4 x9 + +#define q x1 +#define t0 x1 + +S2N_BN_SYMBOL(bignum_tomont_sm2): + +// Load the input + + ldp d0, d1, [x1] + ldp d2, d3, [x1, #16] + +// Do an initial reduction to make sure this is < p_sm2, using just +// a copy of the bignum_mod_sm2_4 code. This is needed to set up the +// invariant "input < p_sm2" for the main modular reduction steps. + + subs t0, d0, #-1 + mov t1, #0xffffffff00000000 + sbcs t1, d1, t1 + adcs t2, d2, xzr + mov t3, #0xfffffffeffffffff + sbcs t3, d3, t3 + csel d0, d0, t0, cc + csel d1, d1, t1, cc + csel d2, d2, t2, cc + csel d3, d3, t3, cc + +// Now do 4 iterations of a basic x |-> (2^64 * x) mod p_sm2 step. + + modstep_sm2() + modstep_sm2() + modstep_sm2() + modstep_sm2() + +// Store the result and return + + stp d0, d1, [x0] + stp d2, d3, [x0, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_triple_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_triple_sm2.S new file mode 100644 index 00000000000..3811fc9ef99 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/bignum_triple_sm2.S @@ -0,0 +1,107 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_sm2, z := (3 * x) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_triple_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_sm2, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_sm2. +// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_sm2) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_sm2_alt) + .text + .balign 4 + +#define z x0 +#define x x1 + +#define d0 x2 +#define d1 x3 +#define d2 x4 +#define d3 x5 +#define h x6 + +// Slightly offset aliases for the d_i for readability. 
+ +#define a0 x3 +#define a1 x4 +#define a2 x5 +#define a3 x6 + +// More aliases for the same thing at different stages + +#define q x6 +#define c x6 + +// Other temporary variables + +#define t0 x7 +#define t1 x8 + +S2N_BN_SYMBOL(bignum_triple_sm2): + +S2N_BN_SYMBOL(bignum_triple_sm2_alt): + +// Load the inputs + + ldp a0, a1, [x] + ldp a2, a3, [x, #16] + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] + + lsl d0, a0, #1 + adds d0, d0, a0 + extr d1, a1, a0, #63 + adcs d1, d1, a1 + extr d2, a2, a1, #63 + adcs d2, d2, a2 + extr d3, a3, a2, #63 + adcs d3, d3, a3 + lsr h, a3, #63 + adc h, h, xzr + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_sm2 <= z - q * p_sm2 < p_sm2, so we just need +// to subtract q * p_sm2 and then if that's negative, add back p_sm2. + + add q, h, #1 + +// Initial subtraction of z - q * p_sm2, with bitmask c for the carry + + lsl t0, q, #32 + sub t1, t0, q + adds d0, d0, q + adcs d1, d1, t1 + adcs d2, d2, xzr + adcs d3, d3, t0 + csetm c, cc + +// Use the bitmask c for final masked addition of p_sm2. + + adds d0, d0, c + and t1, c, #0xffffffff00000000 + adcs d1, d1, t1 + adcs d2, d2, c + and t0, c, #0xfffffffeffffffff + adc d3, d3, t0 + +// Finally store the result + + stp d0, d1, [z] + stp d2, d3, [z, #16] + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd.S new file mode 100644 index 00000000000..ac916cc547b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd.S @@ -0,0 +1,540 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
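+//
+// Note added for this import (not upstream commentary): as usual for Jacobian
+// coordinates, the point at infinity is represented by any triple with z = 0.
+// The code below computes the generic sum and then, in the final multiplexing
+// step, returns p2 unchanged when p1 has z = 0 and p1 unchanged when p2 has
+// z = 0, so those inputs need no separate code path.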
+// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x17 +#define input_x x19 +#define input_y x20 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds to bignum_montmul_sm2 with x0 in place of x17 + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2] __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x3, x4 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x0, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x0 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x0, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs x14, x14, x0 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x11, x12, [P0+16] __LF \ + mul x11, x5, x9 __LF \ + mul x13, x6, x10 __LF \ + umulh x12, x5, x9 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x6, x10 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x5, x6 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x10, x9 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + subs x3, x5, x3 __LF \ + sbcs x4, x6, x4 __LF \ + ngc x5, xzr __LF \ + cmn x5, #1 __LF \ + eor x3, x3, x5 __LF \ + 
adcs x3, x3, xzr __LF \ + eor x4, x4, x5 __LF \ + adcs x4, x4, xzr __LF \ + subs x7, x7, x9 __LF \ + sbcs x8, x8, x10 __LF \ + ngc x9, xzr __LF \ + cmn x9, #1 __LF \ + eor x7, x7, x9 __LF \ + adcs x7, x7, xzr __LF \ + eor x8, x8, x9 __LF \ + adcs x8, x8, xzr __LF \ + eor x10, x5, x9 __LF \ + ldp x15, x1, [P0] __LF \ + adds x15, x11, x15 __LF \ + adcs x1, x12, x1 __LF \ + ldp x5, x9, [P0+16] __LF \ + adcs x5, x13, x5 __LF \ + adcs x9, x14, x9 __LF \ + adc x2, xzr, xzr __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x3, x3, x4 __LF \ + cneg x3, x3, lo __LF \ + csetm x4, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x3, x0 __LF \ + umulh x0, x3, x0 __LF \ + cinv x4, x4, lo __LF \ + eor x16, x16, x4 __LF \ + eor x0, x0, x4 __LF \ + cmn x4, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x4 __LF \ + cmn x10, #1 __LF \ + eor x11, x11, x10 __LF \ + adcs x11, x11, x15 __LF \ + eor x12, x12, x10 __LF \ + adcs x12, x12, x1 __LF \ + eor x13, x13, x10 __LF \ + adcs x13, x13, x5 __LF \ + eor x14, x14, x10 __LF \ + adcs x14, x14, x9 __LF \ + adcs x3, x2, x10 __LF \ + adcs x4, x10, xzr __LF \ + adc x10, x10, xzr __LF \ + adds x13, x13, x15 __LF \ + adcs x14, x14, x1 __LF \ + adcs x3, x3, x5 __LF \ + adcs x4, x4, x9 __LF \ + adc x10, x10, x2 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x0, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x0 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x0, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs x14, x14, x0 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + adds x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adc x10, x10, xzr __LF \ + add x2, x10, #1 __LF \ + lsl x15, x2, #32 __LF \ + sub x16, x15, x2 __LF \ + adds x13, x13, x2 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, x15 __LF \ + csetm x7, lo __LF \ + adds x13, x13, x7 __LF \ + and x16, x7, #0xffffffff00000000 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, x7 __LF \ + and x15, x7, #0xfffffffeffffffff __LF \ + adc x4, x4, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x3, x4, [P0+16] + +// Corresponds to bignum_montsqr_sm2 with x0 in place of x17 + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + ldp x4, x5, [P1+16] __LF \ + umull x15, w2, w2 __LF \ + lsr x11, x2, #32 __LF \ + umull x16, w11, w11 __LF \ + umull x11, w2, w11 __LF \ + adds x15, x15, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x16, x16, x11 __LF \ + umull x0, w3, w3 __LF \ + lsr x11, x3, #32 __LF \ + umull x1, w11, w11 __LF \ + umull x11, w3, w11 __LF \ + mul x12, x2, x3 __LF \ + umulh x13, x2, x3 __LF \ + adds x0, x0, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x1, x1, x11 __LF \ + adds x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x16, x12 __LF \ + adcs x0, x0, x13 __LF \ + adc x1, x1, xzr __LF \ + lsl x12, x15, #32 __LF \ + lsr x11, x15, #32 __LF \ + subs x14, x12, x15 __LF \ + sbc x13, x11, xzr __LF \ + subs x16, x16, x14 __LF \ + sbcs x0, x0, x13 __LF \ + sbcs x1, x1, x12 __LF \ + sbc x15, x15, x11 __LF \ + lsl x12, x16, #32 __LF \ + lsr x11, x16, #32 __LF \ + subs x14, x12, x16 
__LF \ + sbc x13, x11, xzr __LF \ + subs x0, x0, x14 __LF \ + sbcs x1, x1, x13 __LF \ + sbcs x15, x15, x12 __LF \ + sbc x16, x16, x11 __LF \ + mul x6, x2, x4 __LF \ + mul x14, x3, x5 __LF \ + umulh x8, x2, x4 __LF \ + subs x10, x2, x3 __LF \ + cneg x10, x10, lo __LF \ + csetm x13, lo __LF \ + subs x12, x5, x4 __LF \ + cneg x12, x12, lo __LF \ + mul x11, x10, x12 __LF \ + umulh x12, x10, x12 __LF \ + cinv x13, x13, lo __LF \ + eor x11, x11, x13 __LF \ + eor x12, x12, x13 __LF \ + adds x7, x6, x8 __LF \ + adc x8, x8, xzr __LF \ + umulh x9, x3, x5 __LF \ + adds x7, x7, x14 __LF \ + adcs x8, x8, x9 __LF \ + adc x9, x9, xzr __LF \ + adds x8, x8, x14 __LF \ + adc x9, x9, xzr __LF \ + cmn x13, #1 __LF \ + adcs x7, x7, x11 __LF \ + adcs x8, x8, x12 __LF \ + adc x9, x9, x13 __LF \ + adds x6, x6, x6 __LF \ + adcs x7, x7, x7 __LF \ + adcs x8, x8, x8 __LF \ + adcs x9, x9, x9 __LF \ + adc x10, xzr, xzr __LF \ + adds x6, x6, x0 __LF \ + adcs x7, x7, x1 __LF \ + adcs x8, x8, x15 __LF \ + adcs x9, x9, x16 __LF \ + adc x10, x10, xzr __LF \ + lsl x12, x6, #32 __LF \ + lsr x11, x6, #32 __LF \ + subs x14, x12, x6 __LF \ + sbc x13, x11, xzr __LF \ + subs x7, x7, x14 __LF \ + sbcs x8, x8, x13 __LF \ + sbcs x9, x9, x12 __LF \ + sbc x14, x6, x11 __LF \ + adds x10, x10, x14 __LF \ + adc x6, xzr, xzr __LF \ + lsl x12, x7, #32 __LF \ + lsr x11, x7, #32 __LF \ + subs x14, x12, x7 __LF \ + sbc x13, x11, xzr __LF \ + subs x8, x8, x14 __LF \ + sbcs x9, x9, x13 __LF \ + sbcs x10, x10, x12 __LF \ + sbc x14, x7, x11 __LF \ + adds x6, x6, x14 __LF \ + adc x7, xzr, xzr __LF \ + mul x11, x4, x4 __LF \ + adds x8, x8, x11 __LF \ + mul x12, x5, x5 __LF \ + umulh x11, x4, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + umulh x12, x5, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x7, x7, xzr __LF \ + mul x11, x4, x5 __LF \ + umulh x12, x4, x5 __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adc x13, xzr, xzr __LF \ + adds x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + adcs x6, x6, x13 __LF \ + adcs x7, x7, xzr __LF \ + mov x11, #-4294967296 __LF \ + adds x5, x8, #1 __LF \ + sbcs x11, x9, x11 __LF \ + mov x13, #-4294967297 __LF \ + adcs x12, x10, xzr __LF \ + sbcs x13, x6, x13 __LF \ + sbcs xzr, x7, xzr __LF \ + csel x8, x5, x8, hs __LF \ + csel x9, x11, x9, hs __LF \ + csel x10, x12, x10, hs __LF \ + csel x6, x13, x6, hs __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x6, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(sm2_montjadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! 
+ sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + montsqr_sm2(z1sq,z_1) + montsqr_sm2(z2sq,z_2) + + montmul_sm2(y1a,z_2,y_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,z1sq,x_2) + montmul_sm2(x1a,z2sq,x_1) + montmul_sm2(y2a,z1sq,y2a) + montmul_sm2(y1a,z2sq,y1a) + + sub_sm2(xd,x2a,x1a) + sub_sm2(yd,y2a,y1a) + + montsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x1a) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(xd,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y1a) + montmul_sm2(resz,xd,z_2) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore registers and return + + add sp, sp, NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd_alt.S new file mode 100644 index 00000000000..390c203ffe9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjadd_alt.S @@ -0,0 +1,548 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. 
x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE +#define z_2 input_y, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define x1a sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define z2sq sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define y1a sp, #(NUMSIZE*6) + +#define NSPACE (NUMSIZE*7) + +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + lsl x11, x12, #32 __LF \ + lsr x6, x12, #32 __LF \ + subs x8, x11, x12 __LF \ + sbc x7, x6, xzr __LF \ + 
subs x13, x13, x8 __LF \ + sbcs x14, x14, x7 __LF \ + sbcs x0, x0, x11 __LF \ + sbc x12, x12, x6 __LF \ + lsl x11, x13, #32 __LF \ + lsr x6, x13, #32 __LF \ + subs x8, x11, x13 __LF \ + sbc x7, x6, xzr __LF \ + subs x14, x14, x8 __LF \ + sbcs x0, x0, x7 __LF \ + sbcs x12, x12, x11 __LF \ + sbc x13, x13, x6 __LF \ + lsl x11, x14, #32 __LF \ + lsr x6, x14, #32 __LF \ + subs x8, x11, x14 __LF \ + sbc x7, x6, xzr __LF \ + subs x0, x0, x8 __LF \ + sbcs x12, x12, x7 __LF \ + sbcs x13, x13, x11 __LF \ + sbc x14, x14, x6 __LF \ + lsl x11, x0, #32 __LF \ + lsr x6, x0, #32 __LF \ + subs x8, x11, x0 __LF \ + sbc x7, x6, xzr __LF \ + subs x12, x12, x8 __LF \ + sbcs x13, x13, x7 __LF \ + sbcs x14, x14, x11 __LF \ + sbc x0, x0, x6 __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff00000000 __LF \ + mov x6, #0xfffffffeffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + adcs x4, x14, xzr __LF \ + sbcs x5, x0, x6 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds to bignum_montsqr_sm2_alt exactly + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + lsl x4, x8, #32 __LF \ + lsr x5, x8, #32 __LF \ + subs x2, x4, x8 __LF \ + sbc x3, x5, xzr __LF \ + subs x9, x9, x2 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x4 __LF \ + sbc x8, x8, x5 __LF \ + lsl x4, x9, #32 __LF \ + lsr x5, x9, #32 __LF \ + subs x2, x4, x9 __LF \ + sbc x3, x5, xzr __LF \ + subs x10, x10, x2 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x8, x8, x4 __LF \ + sbc x9, x9, x5 __LF \ + lsl x4, x10, #32 __LF \ + lsr x5, x10, #32 __LF \ + subs x2, x4, x10 __LF \ + sbc x3, x5, xzr __LF \ + subs x11, x11, x2 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x4 __LF \ + sbc x10, x10, x5 __LF \ + lsl x4, x11, #32 __LF \ + lsr x5, x11, #32 __LF \ + subs x2, x4, x11 __LF \ + sbc x3, x5, xzr __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, x4 __LF \ + sbc x11, x11, x5 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, cs __LF \ + mov x3, #0xffffffff00000000 __LF \ + mov x5, #0xfffffffeffffffff __LF \ + adds x12, x8, #0x1 __LF \ + sbcs 
x13, x9, x3 __LF \ + adcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF \ + csel x8, x8, x12, cc __LF \ + csel x9, x9, x13, cc __LF \ + csel x10, x10, x14, cc __LF \ + csel x11, x11, x7, cc __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + lsl x4, x8, #32 __LF \ + lsr x5, x8, #32 __LF \ + subs x2, x4, x8 __LF \ + sbc x3, x5, xzr __LF \ + subs x9, x9, x2 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x4 __LF \ + sbc x8, x8, x5 __LF \ + lsl x4, x9, #32 __LF \ + lsr x5, x9, #32 __LF \ + subs x2, x4, x9 __LF \ + sbc x3, x5, xzr __LF \ + subs x10, x10, x2 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x8, x8, x4 __LF \ + sbc x9, x9, x5 __LF \ + lsl x4, x10, #32 __LF \ + lsr x5, x10, #32 __LF \ + subs x2, x4, x10 __LF \ + sbc x3, x5, xzr __LF \ + subs x11, x11, x2 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x4 __LF \ + sbc x10, x10, x5 __LF \ + lsl x4, x11, #32 __LF \ + lsr x5, x11, #32 __LF \ + subs x2, x4, x11 __LF \ + sbc x3, x5, xzr __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, x4 __LF \ + sbc x11, x11, x5 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + csetm x2, cs __LF \ + subs x8, x8, x2 __LF \ + and x3, x2, #0xffffffff00000000 __LF \ + sbcs x9, x9, x3 __LF \ + and x5, x2, #0xfffffffeffffffff __LF \ + sbcs x10, x10, x2 __LF \ + sbc x11, x11, x5 __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(sm2_montjadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main 
code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_sm2(z1sq,z_1) + amontsqr_sm2(z2sq,z_2) + + montmul_sm2(y1a,z_2,y_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,z1sq,x_2) + montmul_sm2(x1a,z2sq,x_1) + montmul_sm2(y2a,z1sq,y2a) + montmul_sm2(y1a,z2sq,y1a) + + sub_sm2(xd,x2a,x1a) + sub_sm2(yd,y2a,y1a) + + amontsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x1a) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(xd,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y1a) + montmul_sm2(resz,xd,z_2) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "HI" <=> CF /\ ~ZF <=> P1 = 0 /\ ~(P2 = 0) +// and "LO" <=> ~CF <=> ~(P1 = 0) /\ P2 = 0 + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + + ldp x4, x5, [z_2] + ldp x6, x7, [z_2+16] + + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + + cmp x13, x12 + +// Multiplex the outputs accordingly, re-using the z's in registers + + ldp x8, x9, [resz] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [resz+16] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + + ldp x12, x13, [x_1] + ldp x0, x1, [resx] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x_2] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + + ldp x12, x13, [x_1+16] + ldp x2, x3, [resx+16] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x_2+16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + + ldp x12, x13, [y_1] + ldp x4, x5, [resy] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [y_2] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + + ldp x12, x13, [y_1+16] + ldp x6, x7, [resy+16] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [y_2+16] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + +// Finally store back the multiplexed values + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble.S new file mode 100644 index 00000000000..e878d939cc1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble.S @@ -0,0 +1,663 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
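+//
+// Note added for this import (not upstream commentary): SM2 uses a = -3, and
+// the sequence below is the standard Jacobian doubling for that case. With
+//
+//   m = 3 * (x + z^2) * (x - z^2) = 3 * x2p
+//   s = 4 * x * y^2 = 4 * xy2
+//
+// the code forms d = 12 * xy2 - 9 * x2p^2 and then outputs
+//
+//   x' = 4 * xy2 - d = m^2 - 2 * s
+//   y' = 3 * d * x2p - 8 * y^4 = m * (s - x') - 8 * y^4
+//   z' = (y + z)^2 - y^2 - z^2 = 2 * y * z
+//
+// with every operation performed in the Montgomery domain modulo p_sm2.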
+// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x19 +#define input_x x20 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y4 sp, #(NUMSIZE*0) + +#define y2 sp, #(NUMSIZE*1) + +#define t1 sp, #(NUMSIZE*2) + +#define t2 sp, #(NUMSIZE*3) +#define x2p sp, #(NUMSIZE*3) +#define dx2 sp, #(NUMSIZE*3) + +#define xy2 sp, #(NUMSIZE*4) + +#define x4p sp, #(NUMSIZE*5) +#define d_ sp, #(NUMSIZE*5) + +#define NSPACE #(NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2 exactly + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2] __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x17, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x3, x4 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x17, x8, x7 __LF \ + cneg x17, x17, lo __LF \ + mul x16, x15, x17 __LF \ + umulh x17, x15, x17 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x17, x17, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adc x14, x14, x1 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x17, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x17 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x17, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs x14, x14, x17 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x11, x12, [P0+16] __LF \ + mul x11, x5, x9 __LF \ + mul x13, x6, x10 __LF \ + umulh x12, x5, x9 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x6, x10 __LF \ + adcs x17, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x5, x6 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x17, x10, x9 __LF \ + cneg x17, x17, lo __LF \ + mul x16, x15, x17 __LF \ + umulh x17, x15, x17 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x17, x17, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adc x14, x14, x1 __LF \ + subs x3, x5, x3 __LF \ + sbcs x4, x6, x4 __LF \ + ngc x5, xzr __LF \ + cmn x5, #1 __LF \ + eor x3, x3, x5 __LF \ + adcs x3, x3, xzr __LF \ + eor x4, x4, x5 __LF \ + adcs x4, x4, xzr __LF \ + subs x7, x7, x9 __LF \ + sbcs x8, x8, x10 __LF \ + ngc x9, xzr __LF \ + cmn x9, #1 __LF \ + eor x7, x7, x9 __LF \ + adcs x7, x7, xzr __LF \ + eor x8, x8, x9 __LF \ + adcs x8, x8, xzr __LF \ + eor x10, x5, x9 __LF \ + ldp x15, x1, [P0] __LF \ + adds x15, 
x11, x15 __LF \ + adcs x1, x12, x1 __LF \ + ldp x5, x9, [P0+16] __LF \ + adcs x5, x13, x5 __LF \ + adcs x9, x14, x9 __LF \ + adc x2, xzr, xzr __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x17, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adcs x14, x14, xzr __LF \ + subs x3, x3, x4 __LF \ + cneg x3, x3, lo __LF \ + csetm x4, lo __LF \ + subs x17, x8, x7 __LF \ + cneg x17, x17, lo __LF \ + mul x16, x3, x17 __LF \ + umulh x17, x3, x17 __LF \ + cinv x4, x4, lo __LF \ + eor x16, x16, x4 __LF \ + eor x17, x17, x4 __LF \ + cmn x4, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x17 __LF \ + adc x14, x14, x4 __LF \ + cmn x10, #1 __LF \ + eor x11, x11, x10 __LF \ + adcs x11, x11, x15 __LF \ + eor x12, x12, x10 __LF \ + adcs x12, x12, x1 __LF \ + eor x13, x13, x10 __LF \ + adcs x13, x13, x5 __LF \ + eor x14, x14, x10 __LF \ + adcs x14, x14, x9 __LF \ + adcs x3, x2, x10 __LF \ + adcs x4, x10, xzr __LF \ + adc x10, x10, xzr __LF \ + adds x13, x13, x15 __LF \ + adcs x14, x14, x1 __LF \ + adcs x3, x3, x5 __LF \ + adcs x4, x4, x9 __LF \ + adc x10, x10, x2 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x17, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x17 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x17, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs x14, x14, x17 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + adds x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adc x10, x10, xzr __LF \ + add x2, x10, #1 __LF \ + lsl x15, x2, #32 __LF \ + sub x16, x15, x2 __LF \ + adds x13, x13, x2 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, x15 __LF \ + csetm x7, lo __LF \ + adds x13, x13, x7 __LF \ + and x16, x7, #0xffffffff00000000 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, x7 __LF \ + and x15, x7, #0xfffffffeffffffff __LF \ + adc x4, x4, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x3, x4, [P0+16] + +// Corresponds to bignum_montsqr_sm2 exactly + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + ldp x4, x5, [P1+16] __LF \ + umull x15, w2, w2 __LF \ + lsr x11, x2, #32 __LF \ + umull x16, w11, w11 __LF \ + umull x11, w2, w11 __LF \ + adds x15, x15, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x16, x16, x11 __LF \ + umull x17, w3, w3 __LF \ + lsr x11, x3, #32 __LF \ + umull x1, w11, w11 __LF \ + umull x11, w3, w11 __LF \ + mul x12, x2, x3 __LF \ + umulh x13, x2, x3 __LF \ + adds x17, x17, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x1, x1, x11 __LF \ + adds x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x16, x12 __LF \ + adcs x17, x17, x13 __LF \ + adc x1, x1, xzr __LF \ + lsl x12, x15, #32 __LF \ + lsr x11, x15, #32 __LF \ + subs x14, x12, x15 __LF \ + sbc x13, x11, xzr __LF \ + subs x16, x16, x14 __LF \ + sbcs x17, x17, x13 __LF \ + sbcs x1, x1, x12 __LF \ + sbc x15, x15, x11 __LF \ + lsl x12, x16, #32 __LF \ + lsr x11, x16, #32 __LF \ + subs x14, x12, x16 __LF \ + sbc x13, x11, xzr __LF \ + subs x17, x17, x14 __LF \ + sbcs x1, x1, x13 __LF \ + sbcs x15, x15, x12 __LF \ + sbc x16, x16, x11 __LF \ + mul x6, x2, x4 __LF \ + mul x14, x3, x5 __LF \ + umulh x8, x2, x4 __LF \ + subs x10, x2, x3 __LF \ + cneg x10, x10, lo __LF \ + csetm x13, lo __LF \ + subs x12, x5, x4 __LF \ + 
cneg x12, x12, lo __LF \ + mul x11, x10, x12 __LF \ + umulh x12, x10, x12 __LF \ + cinv x13, x13, lo __LF \ + eor x11, x11, x13 __LF \ + eor x12, x12, x13 __LF \ + adds x7, x6, x8 __LF \ + adc x8, x8, xzr __LF \ + umulh x9, x3, x5 __LF \ + adds x7, x7, x14 __LF \ + adcs x8, x8, x9 __LF \ + adc x9, x9, xzr __LF \ + adds x8, x8, x14 __LF \ + adc x9, x9, xzr __LF \ + cmn x13, #1 __LF \ + adcs x7, x7, x11 __LF \ + adcs x8, x8, x12 __LF \ + adc x9, x9, x13 __LF \ + adds x6, x6, x6 __LF \ + adcs x7, x7, x7 __LF \ + adcs x8, x8, x8 __LF \ + adcs x9, x9, x9 __LF \ + adc x10, xzr, xzr __LF \ + adds x6, x6, x17 __LF \ + adcs x7, x7, x1 __LF \ + adcs x8, x8, x15 __LF \ + adcs x9, x9, x16 __LF \ + adc x10, x10, xzr __LF \ + lsl x12, x6, #32 __LF \ + lsr x11, x6, #32 __LF \ + subs x14, x12, x6 __LF \ + sbc x13, x11, xzr __LF \ + subs x7, x7, x14 __LF \ + sbcs x8, x8, x13 __LF \ + sbcs x9, x9, x12 __LF \ + sbc x14, x6, x11 __LF \ + adds x10, x10, x14 __LF \ + adc x6, xzr, xzr __LF \ + lsl x12, x7, #32 __LF \ + lsr x11, x7, #32 __LF \ + subs x14, x12, x7 __LF \ + sbc x13, x11, xzr __LF \ + subs x8, x8, x14 __LF \ + sbcs x9, x9, x13 __LF \ + sbcs x10, x10, x12 __LF \ + sbc x14, x7, x11 __LF \ + adds x6, x6, x14 __LF \ + adc x7, xzr, xzr __LF \ + mul x11, x4, x4 __LF \ + adds x8, x8, x11 __LF \ + mul x12, x5, x5 __LF \ + umulh x11, x4, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + umulh x12, x5, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x7, x7, xzr __LF \ + mul x11, x4, x5 __LF \ + umulh x12, x4, x5 __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adc x13, xzr, xzr __LF \ + adds x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + adcs x6, x6, x13 __LF \ + adcs x7, x7, xzr __LF \ + mov x11, #-4294967296 __LF \ + adds x5, x8, #1 __LF \ + sbcs x11, x9, x11 __LF \ + mov x13, #-4294967297 __LF \ + adcs x12, x10, xzr __LF \ + sbcs x13, x6, x13 __LF \ + sbcs xzr, x7, xzr __LF \ + csel x8, x5, x8, hs __LF \ + csel x9, x11, x9, hs __LF \ + csel x10, x12, x10, hs __LF \ + csel x6, x13, x6, hs __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x6, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Corresponds exactly to bignum_add_sm2 + +#define add_sm2(P0,P1,P2) \ + ldp x4, x5, [P1] __LF \ + ldp x8, x9, [P2] __LF \ + adds x4, x4, x8 __LF \ + adcs x5, x5, x9 __LF \ + ldp x6, x7, [P1+16] __LF \ + ldp x10, x11, [P2+16] __LF \ + adcs x6, x6, x10 __LF \ + adcs x7, x7, x11 __LF \ + adc x3, xzr, xzr __LF \ + adds x8, x4, #0x1 __LF \ + mov x9, #0xffffffff00000000 __LF \ + sbcs x9, x5, x9 __LF \ + adcs x10, x6, xzr __LF \ + mov x11, #0xfffffffeffffffff __LF \ + sbcs x11, x7, x11 __LF \ + sbcs x3, x3, xzr __LF \ + csel x4, x4, x8, cc __LF \ + csel x5, x5, x9, cc __LF \ + csel x6, x6, x10, cc __LF \ + csel x7, x7, x11, cc __LF \ + stp x4, x5, [P0] __LF \ + stp x6, x7, [P0+16] + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_sm2(P0,P1,P2) \ + ldp x4, x5, [P1] __LF \ + ldp x8, x9, [P2] __LF \ + adds x4, x4, x8 __LF \ + adcs x5, x5, x9 __LF \ + ldp x6, x7, [P1+16] __LF \ + ldp 
x10, x11, [P2+16] __LF \ + adcs x6, x6, x10 __LF \ + adcs x7, x7, x11 __LF \ + csetm x2, cs __LF \ + subs x4, x4, x2 __LF \ + and x3, x2, #0xffffffff00000000 __LF \ + sbcs x5, x5, x3 __LF \ + and x1, x2, #0xfffffffeffffffff __LF \ + sbcs x6, x6, x2 __LF \ + sbc x7, x7, x1 __LF \ + stp x4, x5, [P0] __LF \ + stp x6, x7, [P0+16] + +// P0 = C * P1 - D * P2 computed as D * (p_sm2 - P2) + C * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_sm2 +// This also applies to the other functions following. + +#define cmsub_sm2(P0,C,P1,D,P2) \ + mov x1, D __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x3, #0xffffffff00000000 __LF \ + sbcs x10, x3, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, x2, x11 __LF \ + mov x4, #0xfffffffeffffffff __LF \ + sbc x12, x4, x12 __LF \ + mul x3, x1, x9 __LF \ + mul x4, x1, x10 __LF \ + mul x5, x1, x11 __LF \ + mul x6, x1, x12 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + umulh x11, x1, x11 __LF \ + umulh x7, x1, x12 __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, xzr __LF \ + mov x1, C __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x7, x7, #0x1 __LF \ + lsl x8, x7, #32 __LF \ + sub x9, x8, x7 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, x8 __LF \ + csetm x7, cc __LF \ + adds x3, x3, x7 __LF \ + and x9, x7, #0xffffffff00000000 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x7 __LF \ + and x8, x7, #0xfffffffeffffffff __LF \ + adc x6, x6, x8 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// P0 = 4 * P1 - P2, by direct subtraction of P2; the method +// in bignum_cmul_sm2 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_sm2, which is the case here. 
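+//
+// Note added for this import (not upstream commentary), spelling out the
+// q = h + 1 bound: write the value to be reduced as v = 2^256 * h + l with
+// 0 <= l < 2^256. Then
+//
+//   v - (h + 1) * p_sm2 = (l - p_sm2) + h * (2^256 - p_sm2)
+//
+// is always >= -p_sm2, and since 2^256 - p_sm2 < 2^225 it stays below p_sm2
+// whenever h is well below 2^31 (here h never exceeds a few dozen). So one
+// subtraction of (h + 1) * p_sm2 followed by a single masked addition of
+// p_sm2 on borrow yields the fully reduced result, and as noted above the
+// same estimate still works for a negative v as long as v > -p_sm2.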
+ +#define cmsub41_sm2(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + lsr x4, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x4, x4, #0x1 __LF \ + lsl x5, x4, #32 __LF \ + sub x6, x5, x4 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, x5 __LF \ + csetm x4, cc __LF \ + adds x0, x0, x4 __LF \ + and x6, x4, #0xffffffff00000000 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, x4 __LF \ + and x5, x4, #0xfffffffeffffffff __LF \ + adc x3, x3, x5 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2, computed as (p_sm2 - P2) << 3 + 3 * P1 + +#define cmsub38_sm2(P0,P1,P2) \ + mov x1, 8 __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x3, #0xffffffff00000000 __LF \ + sbcs x10, x3, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, x2, x11 __LF \ + mov x4, #0xfffffffeffffffff __LF \ + sbc x12, x4, x12 __LF \ + lsl x3, x9, #3 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + extr x6, x12, x11, #61 __LF \ + lsr x7, x12, #61 __LF \ + mov x1, 3 __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x7, x7, #0x1 __LF \ + lsl x8, x7, #32 __LF \ + sub x9, x8, x7 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, x8 __LF \ + csetm x7, cc __LF \ + adds x3, x3, x7 __LF \ + and x9, x7, #0xffffffff00000000 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x7 __LF \ + and x8, x7, #0xfffffffeffffffff __LF \ + adc x6, x6, x8 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(sm2_montjdouble): + +// Save registers and make room on stack for temporary variables + + sub sp, sp, NSPACE+16 + stp x19, x20, [sp, NSPACE] + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_sm2(z2,z_1) + montsqr_sm2(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_sm2(t2,x_1,z2) + weakadd_sm2(t1,x_1,z2) + montmul_sm2(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_sm2(t1,y_1,z_1) + montmul_sm2(xy2,x_1,y2) + montsqr_sm2(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_sm2(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_sm2(d_,12,xy2,9,x4p) + sub_sm2(t1,t1,z2) + +// y4 = y^4 + + montsqr_sm2(y4,y2) + +// dx2 = d * x2p + + montmul_sm2(dx2,d_,x2p) + +// z_3' = 2 * y * z + + sub_sm2(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_sm2(x_3,xy2,d_) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_sm2(y_3,dx2,y4) + +// Restore registers and stack and return + + ldp x19, x20, [sp, NSPACE] + add sp, sp, NSPACE+16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git 
a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble_alt.S new file mode 100644 index 00000000000..4d29b945fcf --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjdouble_alt.S @@ -0,0 +1,577 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard ARM ABI: X0 = p3, X1 = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 sp, #(NUMSIZE*0) +#define y4 sp, #(NUMSIZE*0) + +#define y2 sp, #(NUMSIZE*1) + +#define t1 sp, #(NUMSIZE*2) + +#define t2 sp, #(NUMSIZE*3) +#define x2p sp, #(NUMSIZE*3) +#define dx2 sp, #(NUMSIZE*3) + +#define xy2 sp, #(NUMSIZE*4) + +#define x4p sp, #(NUMSIZE*5) +#define d sp, #(NUMSIZE*5) + +#define NSPACE #(NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + 
mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + lsl x11, x12, #32 __LF \ + lsr x6, x12, #32 __LF \ + subs x8, x11, x12 __LF \ + sbc x7, x6, xzr __LF \ + subs x13, x13, x8 __LF \ + sbcs x14, x14, x7 __LF \ + sbcs x0, x0, x11 __LF \ + sbc x12, x12, x6 __LF \ + lsl x11, x13, #32 __LF \ + lsr x6, x13, #32 __LF \ + subs x8, x11, x13 __LF \ + sbc x7, x6, xzr __LF \ + subs x14, x14, x8 __LF \ + sbcs x0, x0, x7 __LF \ + sbcs x12, x12, x11 __LF \ + sbc x13, x13, x6 __LF \ + lsl x11, x14, #32 __LF \ + lsr x6, x14, #32 __LF \ + subs x8, x11, x14 __LF \ + sbc x7, x6, xzr __LF \ + subs x0, x0, x8 __LF \ + sbcs x12, x12, x7 __LF \ + sbcs x13, x13, x11 __LF \ + sbc x14, x14, x6 __LF \ + lsl x11, x0, #32 __LF \ + lsr x6, x0, #32 __LF \ + subs x8, x11, x0 __LF \ + sbc x7, x6, xzr __LF \ + subs x12, x12, x8 __LF \ + sbcs x13, x13, x7 __LF \ + sbcs x14, x14, x11 __LF \ + sbc x0, x0, x6 __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff00000000 __LF \ + mov x6, #0xfffffffeffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + adcs x4, x14, xzr __LF \ + sbcs x5, x0, x6 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds to bignum_montsqr_sm2_alt exactly + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + lsl x4, x8, #32 __LF \ + lsr x5, x8, #32 __LF \ + subs x2, x4, x8 __LF \ + sbc x3, x5, xzr __LF \ + subs x9, x9, x2 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x4 __LF \ + sbc x8, x8, x5 __LF \ + lsl x4, x9, #32 __LF \ + lsr x5, x9, #32 __LF \ + subs x2, x4, x9 __LF \ + sbc x3, x5, xzr __LF \ + subs x10, x10, x2 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x8, x8, x4 __LF \ + sbc x9, x9, x5 __LF \ + lsl x4, x10, #32 __LF \ + lsr x5, x10, #32 __LF \ + subs x2, x4, x10 __LF \ + sbc x3, x5, xzr __LF \ + subs x11, x11, x2 __LF \ + sbcs x8, x8, x3 __LF \ + 
sbcs x9, x9, x4 __LF \ + sbc x10, x10, x5 __LF \ + lsl x4, x11, #32 __LF \ + lsr x5, x11, #32 __LF \ + subs x2, x4, x11 __LF \ + sbc x3, x5, xzr __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, x4 __LF \ + sbc x11, x11, x5 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, cs __LF \ + mov x3, #0xffffffff00000000 __LF \ + mov x5, #0xfffffffeffffffff __LF \ + adds x12, x8, #0x1 __LF \ + sbcs x13, x9, x3 __LF \ + adcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF \ + csel x8, x8, x12, cc __LF \ + csel x9, x9, x13, cc __LF \ + csel x10, x10, x14, cc __LF \ + csel x11, x11, x7, cc __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +// Corresponds exactly to bignum_add_sm2 + +#define add_sm2(P0,P1,P2) \ + ldp x4, x5, [P1] __LF \ + ldp x8, x9, [P2] __LF \ + adds x4, x4, x8 __LF \ + adcs x5, x5, x9 __LF \ + ldp x6, x7, [P1+16] __LF \ + ldp x10, x11, [P2+16] __LF \ + adcs x6, x6, x10 __LF \ + adcs x7, x7, x11 __LF \ + adc x3, xzr, xzr __LF \ + adds x8, x4, #0x1 __LF \ + mov x9, #0xffffffff00000000 __LF \ + sbcs x9, x5, x9 __LF \ + adcs x10, x6, xzr __LF \ + mov x11, #0xfffffffeffffffff __LF \ + sbcs x11, x7, x11 __LF \ + sbcs x3, x3, xzr __LF \ + csel x4, x4, x8, cc __LF \ + csel x5, x5, x9, cc __LF \ + csel x6, x6, x10, cc __LF \ + csel x7, x7, x11, cc __LF \ + stp x4, x5, [P0] __LF \ + stp x6, x7, [P0+16] + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_sm2(P0,P1,P2) \ + ldp x4, x5, [P1] __LF \ + ldp x8, x9, [P2] __LF \ + adds x4, x4, x8 __LF \ + adcs x5, x5, x9 __LF \ + ldp x6, x7, [P1+16] __LF \ + ldp x10, x11, [P2+16] __LF \ + adcs x6, x6, x10 __LF \ + adcs x7, x7, x11 __LF \ + csetm x2, cs __LF \ + subs x4, x4, x2 __LF \ + and x3, x2, #0xffffffff00000000 __LF \ + sbcs x5, x5, x3 __LF \ + and x1, x2, #0xfffffffeffffffff __LF \ + sbcs x6, x6, x2 __LF \ + sbc x7, x7, x1 __LF \ + stp x4, x5, [P0] __LF \ + stp x6, x7, [P0+16] + +// P0 = C * P1 - D * P2 computed as D * (p_sm2 - P2) + C * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_sm2 +// This also applies to the other functions following. 
+ +#define cmsub_sm2(P0,C,P1,D,P2) \ + mov x1, D __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x3, #0xffffffff00000000 __LF \ + sbcs x10, x3, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, x2, x11 __LF \ + mov x4, #0xfffffffeffffffff __LF \ + sbc x12, x4, x12 __LF \ + mul x3, x1, x9 __LF \ + mul x4, x1, x10 __LF \ + mul x5, x1, x11 __LF \ + mul x6, x1, x12 __LF \ + umulh x9, x1, x9 __LF \ + umulh x10, x1, x10 __LF \ + umulh x11, x1, x11 __LF \ + umulh x7, x1, x12 __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, xzr __LF \ + mov x1, C __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x7, x7, #0x1 __LF \ + lsl x8, x7, #32 __LF \ + sub x9, x8, x7 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, x8 __LF \ + csetm x7, cc __LF \ + adds x3, x3, x7 __LF \ + and x9, x7, #0xffffffff00000000 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x7 __LF \ + and x8, x7, #0xfffffffeffffffff __LF \ + adc x6, x6, x8 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +// P0 = 4 * P1 - P2, by direct subtraction of P2; the method +// in bignum_cmul_sm2 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_sm2, which is the case here. 
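+//
+// (Note added for this import: as in sm2_montjdouble.S, writing the value to
+// be reduced as v = 2^256 * h + l with 0 <= l < 2^256 and h small, the value
+// v - (h + 1) * p_sm2 lies in [-p_sm2, p_sm2), so the single masked addition
+// of p_sm2 after the subtraction fully reduces the result.)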
+ +#define cmsub41_sm2(P0,P1,P2) \ + ldp x1, x2, [P1] __LF \ + lsl x0, x1, #2 __LF \ + ldp x6, x7, [P2] __LF \ + subs x0, x0, x6 __LF \ + extr x1, x2, x1, #62 __LF \ + sbcs x1, x1, x7 __LF \ + ldp x3, x4, [P1+16] __LF \ + extr x2, x3, x2, #62 __LF \ + ldp x6, x7, [P2+16] __LF \ + sbcs x2, x2, x6 __LF \ + extr x3, x4, x3, #62 __LF \ + sbcs x3, x3, x7 __LF \ + lsr x4, x4, #62 __LF \ + sbc x4, x4, xzr __LF \ + add x4, x4, #0x1 __LF \ + lsl x5, x4, #32 __LF \ + sub x6, x5, x4 __LF \ + adds x0, x0, x4 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, xzr __LF \ + adcs x3, x3, x5 __LF \ + csetm x4, cc __LF \ + adds x0, x0, x4 __LF \ + and x6, x4, #0xffffffff00000000 __LF \ + adcs x1, x1, x6 __LF \ + adcs x2, x2, x4 __LF \ + and x5, x4, #0xfffffffeffffffff __LF \ + adc x3, x3, x5 __LF \ + stp x0, x1, [P0] __LF \ + stp x2, x3, [P0+16] + +// P0 = 3 * P1 - 8 * P2, computed as (p_sm2 - P2) << 3 + 3 * P1 + +#define cmsub38_sm2(P0,P1,P2) \ + mov x1, 8 __LF \ + mov x2, #-1 __LF \ + ldp x9, x10, [P2] __LF \ + subs x9, x2, x9 __LF \ + mov x3, #0xffffffff00000000 __LF \ + sbcs x10, x3, x10 __LF \ + ldp x11, x12, [P2+16] __LF \ + sbcs x11, x2, x11 __LF \ + mov x4, #0xfffffffeffffffff __LF \ + sbc x12, x4, x12 __LF \ + lsl x3, x9, #3 __LF \ + extr x4, x10, x9, #61 __LF \ + extr x5, x11, x10, #61 __LF \ + extr x6, x12, x11, #61 __LF \ + lsr x7, x12, #61 __LF \ + mov x1, 3 __LF \ + ldp x9, x10, [P1] __LF \ + mul x8, x9, x1 __LF \ + umulh x9, x9, x1 __LF \ + adds x3, x3, x8 __LF \ + mul x8, x10, x1 __LF \ + umulh x10, x10, x1 __LF \ + adcs x4, x4, x8 __LF \ + ldp x11, x12, [P1+16] __LF \ + mul x8, x11, x1 __LF \ + umulh x11, x11, x1 __LF \ + adcs x5, x5, x8 __LF \ + mul x8, x12, x1 __LF \ + umulh x12, x12, x1 __LF \ + adcs x6, x6, x8 __LF \ + adc x7, x7, xzr __LF \ + adds x4, x4, x9 __LF \ + adcs x5, x5, x10 __LF \ + adcs x6, x6, x11 __LF \ + adc x7, x7, x12 __LF \ + add x7, x7, #0x1 __LF \ + lsl x8, x7, #32 __LF \ + sub x9, x8, x7 __LF \ + adds x3, x3, x7 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, xzr __LF \ + adcs x6, x6, x8 __LF \ + csetm x7, cc __LF \ + adds x3, x3, x7 __LF \ + and x9, x7, #0xffffffff00000000 __LF \ + adcs x4, x4, x9 __LF \ + adcs x5, x5, x7 __LF \ + and x8, x7, #0xfffffffeffffffff __LF \ + adc x6, x6, x8 __LF \ + stp x3, x4, [P0] __LF \ + stp x5, x6, [P0+16] + +S2N_BN_SYMBOL(sm2_montjdouble_alt): + +// Make room on stack for temporary variables + + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_sm2(z2,z_1) + montsqr_sm2(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_sm2(t2,x_1,z2) + weakadd_sm2(t1,x_1,z2) + montmul_sm2(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_sm2(t1,y_1,z_1) + montmul_sm2(xy2,x_1,y2) + montsqr_sm2(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_sm2(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_sm2(d,12,xy2,9,x4p) + sub_sm2(t1,t1,z2) + +// y4 = y^4 + + montsqr_sm2(y4,y2) + +// dx2 = d * x2p + + montmul_sm2(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_sm2(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_sm2(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_sm2(y_3,dx2,y4) + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd.S 
b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd.S new file mode 100644 index 00000000000..9f7c13cf740 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd.S @@ -0,0 +1,501 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjmixadd) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x17 +#define input_x x19 +#define input_y x20 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2 with x0 in place of x17 + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x5, x6, [P1+16] __LF \ + ldp x7, x8, [P2] __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x3, x4 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x0, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x0 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x0, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs 
x14, x14, x0 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x11, x12, [P0+16] __LF \ + mul x11, x5, x9 __LF \ + mul x13, x6, x10 __LF \ + umulh x12, x5, x9 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x6, x10 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x15, x5, x6 __LF \ + cneg x15, x15, lo __LF \ + csetm x1, lo __LF \ + subs x0, x10, x9 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x15, x0 __LF \ + umulh x0, x15, x0 __LF \ + cinv x1, x1, lo __LF \ + eor x16, x16, x1 __LF \ + eor x0, x0, x1 __LF \ + cmn x1, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x1 __LF \ + subs x3, x5, x3 __LF \ + sbcs x4, x6, x4 __LF \ + ngc x5, xzr __LF \ + cmn x5, #1 __LF \ + eor x3, x3, x5 __LF \ + adcs x3, x3, xzr __LF \ + eor x4, x4, x5 __LF \ + adcs x4, x4, xzr __LF \ + subs x7, x7, x9 __LF \ + sbcs x8, x8, x10 __LF \ + ngc x9, xzr __LF \ + cmn x9, #1 __LF \ + eor x7, x7, x9 __LF \ + adcs x7, x7, xzr __LF \ + eor x8, x8, x9 __LF \ + adcs x8, x8, xzr __LF \ + eor x10, x5, x9 __LF \ + ldp x15, x1, [P0] __LF \ + adds x15, x11, x15 __LF \ + adcs x1, x12, x1 __LF \ + ldp x5, x9, [P0+16] __LF \ + adcs x5, x13, x5 __LF \ + adcs x9, x14, x9 __LF \ + adc x2, xzr, xzr __LF \ + mul x11, x3, x7 __LF \ + mul x13, x4, x8 __LF \ + umulh x12, x3, x7 __LF \ + adds x16, x11, x13 __LF \ + umulh x14, x4, x8 __LF \ + adcs x0, x12, x14 __LF \ + adcs x14, x14, xzr __LF \ + adds x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adcs x14, x14, xzr __LF \ + subs x3, x3, x4 __LF \ + cneg x3, x3, lo __LF \ + csetm x4, lo __LF \ + subs x0, x8, x7 __LF \ + cneg x0, x0, lo __LF \ + mul x16, x3, x0 __LF \ + umulh x0, x3, x0 __LF \ + cinv x4, x4, lo __LF \ + eor x16, x16, x4 __LF \ + eor x0, x0, x4 __LF \ + cmn x4, #1 __LF \ + adcs x12, x12, x16 __LF \ + adcs x13, x13, x0 __LF \ + adc x14, x14, x4 __LF \ + cmn x10, #1 __LF \ + eor x11, x11, x10 __LF \ + adcs x11, x11, x15 __LF \ + eor x12, x12, x10 __LF \ + adcs x12, x12, x1 __LF \ + eor x13, x13, x10 __LF \ + adcs x13, x13, x5 __LF \ + eor x14, x14, x10 __LF \ + adcs x14, x14, x9 __LF \ + adcs x3, x2, x10 __LF \ + adcs x4, x10, xzr __LF \ + adc x10, x10, xzr __LF \ + adds x13, x13, x15 __LF \ + adcs x14, x14, x1 __LF \ + adcs x3, x3, x5 __LF \ + adcs x4, x4, x9 __LF \ + adc x10, x10, x2 __LF \ + lsl x16, x11, #32 __LF \ + lsr x15, x11, #32 __LF \ + subs x1, x16, x11 __LF \ + sbc x0, x15, xzr __LF \ + subs x12, x12, x1 __LF \ + sbcs x13, x13, x0 __LF \ + sbcs x14, x14, x16 __LF \ + sbc x11, x11, x15 __LF \ + lsl x16, x12, #32 __LF \ + lsr x15, x12, #32 __LF \ + subs x1, x16, x12 __LF \ + sbc x0, x15, xzr __LF \ + subs x13, x13, x1 __LF \ + sbcs x14, x14, x0 __LF \ + sbcs x11, x11, x16 __LF \ + sbc x12, x12, x15 __LF \ + adds x3, x3, x11 __LF \ + adcs x4, x4, x12 __LF \ + adc x10, x10, xzr __LF \ + add x2, x10, #1 __LF \ + lsl x15, x2, #32 __LF \ + sub x16, x15, x2 __LF \ + adds x13, x13, x2 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, xzr __LF \ + adcs x4, x4, x15 __LF \ + csetm x7, lo __LF \ + adds x13, x13, x7 __LF \ + and x16, x7, #0xffffffff00000000 __LF \ + adcs x14, x14, x16 __LF \ + adcs x3, x3, x7 __LF \ + and x15, x7, #0xfffffffeffffffff __LF \ + adc x4, x4, x15 __LF \ + stp x13, x14, [P0] __LF \ + stp x3, x4, [P0+16] + +// Corresponds to bignum_montsqr_sm2 with x0 in place of x17 + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + ldp x4, x5, [P1+16] __LF \ + umull x15, w2, w2 
__LF \ + lsr x11, x2, #32 __LF \ + umull x16, w11, w11 __LF \ + umull x11, w2, w11 __LF \ + adds x15, x15, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x16, x16, x11 __LF \ + umull x0, w3, w3 __LF \ + lsr x11, x3, #32 __LF \ + umull x1, w11, w11 __LF \ + umull x11, w3, w11 __LF \ + mul x12, x2, x3 __LF \ + umulh x13, x2, x3 __LF \ + adds x0, x0, x11, lsl #33 __LF \ + lsr x11, x11, #31 __LF \ + adc x1, x1, x11 __LF \ + adds x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adc x1, x1, xzr __LF \ + adds x16, x16, x12 __LF \ + adcs x0, x0, x13 __LF \ + adc x1, x1, xzr __LF \ + lsl x12, x15, #32 __LF \ + lsr x11, x15, #32 __LF \ + subs x14, x12, x15 __LF \ + sbc x13, x11, xzr __LF \ + subs x16, x16, x14 __LF \ + sbcs x0, x0, x13 __LF \ + sbcs x1, x1, x12 __LF \ + sbc x15, x15, x11 __LF \ + lsl x12, x16, #32 __LF \ + lsr x11, x16, #32 __LF \ + subs x14, x12, x16 __LF \ + sbc x13, x11, xzr __LF \ + subs x0, x0, x14 __LF \ + sbcs x1, x1, x13 __LF \ + sbcs x15, x15, x12 __LF \ + sbc x16, x16, x11 __LF \ + mul x6, x2, x4 __LF \ + mul x14, x3, x5 __LF \ + umulh x8, x2, x4 __LF \ + subs x10, x2, x3 __LF \ + cneg x10, x10, lo __LF \ + csetm x13, lo __LF \ + subs x12, x5, x4 __LF \ + cneg x12, x12, lo __LF \ + mul x11, x10, x12 __LF \ + umulh x12, x10, x12 __LF \ + cinv x13, x13, lo __LF \ + eor x11, x11, x13 __LF \ + eor x12, x12, x13 __LF \ + adds x7, x6, x8 __LF \ + adc x8, x8, xzr __LF \ + umulh x9, x3, x5 __LF \ + adds x7, x7, x14 __LF \ + adcs x8, x8, x9 __LF \ + adc x9, x9, xzr __LF \ + adds x8, x8, x14 __LF \ + adc x9, x9, xzr __LF \ + cmn x13, #1 __LF \ + adcs x7, x7, x11 __LF \ + adcs x8, x8, x12 __LF \ + adc x9, x9, x13 __LF \ + adds x6, x6, x6 __LF \ + adcs x7, x7, x7 __LF \ + adcs x8, x8, x8 __LF \ + adcs x9, x9, x9 __LF \ + adc x10, xzr, xzr __LF \ + adds x6, x6, x0 __LF \ + adcs x7, x7, x1 __LF \ + adcs x8, x8, x15 __LF \ + adcs x9, x9, x16 __LF \ + adc x10, x10, xzr __LF \ + lsl x12, x6, #32 __LF \ + lsr x11, x6, #32 __LF \ + subs x14, x12, x6 __LF \ + sbc x13, x11, xzr __LF \ + subs x7, x7, x14 __LF \ + sbcs x8, x8, x13 __LF \ + sbcs x9, x9, x12 __LF \ + sbc x14, x6, x11 __LF \ + adds x10, x10, x14 __LF \ + adc x6, xzr, xzr __LF \ + lsl x12, x7, #32 __LF \ + lsr x11, x7, #32 __LF \ + subs x14, x12, x7 __LF \ + sbc x13, x11, xzr __LF \ + subs x8, x8, x14 __LF \ + sbcs x9, x9, x13 __LF \ + sbcs x10, x10, x12 __LF \ + sbc x14, x7, x11 __LF \ + adds x6, x6, x14 __LF \ + adc x7, xzr, xzr __LF \ + mul x11, x4, x4 __LF \ + adds x8, x8, x11 __LF \ + mul x12, x5, x5 __LF \ + umulh x11, x4, x4 __LF \ + adcs x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + umulh x12, x5, x5 __LF \ + adcs x6, x6, x12 __LF \ + adc x7, x7, xzr __LF \ + mul x11, x4, x5 __LF \ + umulh x12, x4, x5 __LF \ + adds x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adc x13, xzr, xzr __LF \ + adds x9, x9, x11 __LF \ + adcs x10, x10, x12 __LF \ + adcs x6, x6, x13 __LF \ + adcs x7, x7, xzr __LF \ + mov x11, #-4294967296 __LF \ + adds x5, x8, #1 __LF \ + sbcs x11, x9, x11 __LF \ + mov x13, #-4294967297 __LF \ + adcs x12, x10, xzr __LF \ + sbcs x13, x6, x13 __LF \ + sbcs xzr, x7, xzr __LF \ + csel x8, x5, x8, hs __LF \ + csel x9, x11, x9, hs __LF \ + csel x10, x12, x10, hs __LF \ + csel x6, x13, x6, hs __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x6, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs 
x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(sm2_montjmixadd): + +// Save regs and make room on stack for temporary variables + + stp x19, x20, [sp, #-16]! + sub sp, sp, NSPACE + +// Move the input arguments to stable places + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_sm2(zp2,z_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,zp2,x_2) + montmul_sm2(y2a,zp2,y2a) + + sub_sm2(xd,x2a,x_1) + sub_sm2(yd,y2a,y_1) + + montsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x_1) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(resz,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y_1) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_sm2), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #0x0000000000000001 + mov x13, #0x00000000ffffffff + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [resz+16] + mov x13, #0x0000000100000000 + csel x10, x10, xzr, ne + csel x11, x11, x13, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore registers and return + + add sp, sp, NSPACE + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd_alt.S new file mode 100644 index 00000000000..6c4efc1eb4e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjmixadd_alt.S @@ -0,0 +1,509 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard ARM ABI: X0 = p3, X1 = p1, X2 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjmixadd_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence + +#define input_z x15 +#define input_x x16 +#define input_y x17 + +// Pointer-offset pairs for inputs and outputs + +#define x_1 input_x, #0 +#define y_1 input_x, #NUMSIZE +#define z_1 input_x, #(2*NUMSIZE) + +#define x_2 input_y, #0 +#define y_2 input_y, #NUMSIZE + +#define x_3 input_z, #0 +#define y_3 input_z, #NUMSIZE +#define z_3 input_z, #(2*NUMSIZE) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 sp, #(NUMSIZE*0) +#define ww sp, #(NUMSIZE*0) +#define resx sp, #(NUMSIZE*0) + +#define yd sp, #(NUMSIZE*1) +#define y2a sp, #(NUMSIZE*1) + +#define x2a sp, #(NUMSIZE*2) +#define zzx2 sp, #(NUMSIZE*2) + +#define zz sp, #(NUMSIZE*3) +#define t1 sp, #(NUMSIZE*3) + +#define t2 sp, #(NUMSIZE*4) +#define zzx1 sp, #(NUMSIZE*4) +#define resy sp, #(NUMSIZE*4) + +#define xd sp, #(NUMSIZE*5) +#define resz sp, #(NUMSIZE*5) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + ldp x3, x4, [P1] __LF \ + ldp x7, x8, [P2] __LF \ + mul x12, x3, x7 __LF \ + umulh x13, x3, x7 __LF \ + mul x11, x3, x8 __LF \ + umulh x14, x3, x8 __LF \ + adds x13, x13, x11 __LF \ + ldp x9, x10, [P2+16] __LF \ + mul x11, x3, x9 __LF \ + umulh x0, x3, x9 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x3, x10 __LF \ + umulh x1, x3, x10 __LF \ + adcs x0, x0, x11 __LF \ + adc x1, x1, xzr __LF \ + ldp x5, x6, [P1+16] __LF \ + mul x11, x4, x7 __LF \ + adds x13, x13, x11 __LF \ + mul x11, x4, x8 __LF \ + adcs x14, x14, x11 __LF \ + mul x11, x4, x9 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x4, x10 __LF \ + adcs x1, x1, x11 __LF \ + umulh x3, x4, x10 __LF \ + adc x3, x3, xzr __LF \ + umulh x11, x4, x7 __LF \ + adds x14, x14, x11 __LF \ + umulh x11, x4, x8 __LF \ + adcs x0, x0, x11 __LF \ + umulh x11, x4, x9 __LF \ + adcs x1, x1, x11 __LF \ + adc x3, x3, xzr __LF \ + mul x11, x5, x7 __LF \ + adds x14, x14, x11 __LF \ + mul x11, x5, x8 __LF \ + adcs x0, x0, x11 __LF \ + mul x11, x5, x9 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x5, x10 __LF \ + adcs x3, x3, x11 __LF \ + umulh x4, x5, x10 __LF \ + adc x4, x4, xzr __LF \ + umulh x11, x5, x7 __LF \ + adds x0, x0, x11 __LF \ + umulh x11, x5, x8 __LF \ + adcs x1, x1, x11 __LF \ + umulh x11, x5, x9 __LF \ + adcs x3, x3, x11 __LF \ + adc x4, x4, xzr __LF \ + mul x11, x6, x7 __LF \ + adds x0, x0, x11 __LF \ + mul x11, x6, x8 __LF \ + adcs x1, x1, x11 __LF \ + mul x11, x6, x9 __LF \ + adcs x3, x3, x11 __LF \ + mul x11, x6, x10 __LF \ + adcs x4, x4, x11 __LF \ + umulh x5, x6, x10 __LF \ + adc x5, x5, xzr __LF \ + umulh x11, x6, x7 __LF \ + adds x1, x1, x11 __LF \ + umulh x11, x6, x8 __LF \ + adcs x3, x3, x11 __LF \ + umulh x11, x6, x9 __LF \ + adcs x4, x4, x11 __LF \ + adc x5, x5, xzr __LF \ + lsl x11, x12, #32 __LF \ + lsr x6, x12, #32 __LF \ + subs x8, x11, x12 __LF \ + sbc x7, x6, xzr __LF \ + subs x13, x13, x8 __LF \ + sbcs x14, x14, x7 __LF \ + sbcs x0, x0, x11 __LF \ + sbc x12, x12, 
x6 __LF \ + lsl x11, x13, #32 __LF \ + lsr x6, x13, #32 __LF \ + subs x8, x11, x13 __LF \ + sbc x7, x6, xzr __LF \ + subs x14, x14, x8 __LF \ + sbcs x0, x0, x7 __LF \ + sbcs x12, x12, x11 __LF \ + sbc x13, x13, x6 __LF \ + lsl x11, x14, #32 __LF \ + lsr x6, x14, #32 __LF \ + subs x8, x11, x14 __LF \ + sbc x7, x6, xzr __LF \ + subs x0, x0, x8 __LF \ + sbcs x12, x12, x7 __LF \ + sbcs x13, x13, x11 __LF \ + sbc x14, x14, x6 __LF \ + lsl x11, x0, #32 __LF \ + lsr x6, x0, #32 __LF \ + subs x8, x11, x0 __LF \ + sbc x7, x6, xzr __LF \ + subs x12, x12, x8 __LF \ + sbcs x13, x13, x7 __LF \ + sbcs x14, x14, x11 __LF \ + sbc x0, x0, x6 __LF \ + adds x12, x12, x1 __LF \ + adcs x13, x13, x3 __LF \ + adcs x14, x14, x4 __LF \ + adcs x0, x0, x5 __LF \ + cset x8, cs __LF \ + mov x11, #0xffffffff00000000 __LF \ + mov x6, #0xfffffffeffffffff __LF \ + adds x1, x12, #0x1 __LF \ + sbcs x3, x13, x11 __LF \ + adcs x4, x14, xzr __LF \ + sbcs x5, x0, x6 __LF \ + sbcs xzr, x8, xzr __LF \ + csel x12, x12, x1, cc __LF \ + csel x13, x13, x3, cc __LF \ + csel x14, x14, x4, cc __LF \ + csel x0, x0, x5, cc __LF \ + stp x12, x13, [P0] __LF \ + stp x14, x0, [P0+16] + +// Corresponds to bignum_montsqr_sm2_alt exactly + +#define montsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + lsl x4, x8, #32 __LF \ + lsr x5, x8, #32 __LF \ + subs x2, x4, x8 __LF \ + sbc x3, x5, xzr __LF \ + subs x9, x9, x2 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x4 __LF \ + sbc x8, x8, x5 __LF \ + lsl x4, x9, #32 __LF \ + lsr x5, x9, #32 __LF \ + subs x2, x4, x9 __LF \ + sbc x3, x5, xzr __LF \ + subs x10, x10, x2 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x8, x8, x4 __LF \ + sbc x9, x9, x5 __LF \ + lsl x4, x10, #32 __LF \ + lsr x5, x10, #32 __LF \ + subs x2, x4, x10 __LF \ + sbc x3, x5, xzr __LF \ + subs x11, x11, x2 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x4 __LF \ + sbc x10, x10, x5 __LF \ + lsl x4, x11, #32 __LF \ + lsr x5, x11, #32 __LF \ + subs x2, x4, x11 __LF \ + sbc x3, x5, xzr __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, x4 __LF \ + sbc x11, x11, x5 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + cset x2, cs __LF \ + mov x3, #0xffffffff00000000 __LF \ + mov x5, #0xfffffffeffffffff __LF \ + adds x12, x8, #0x1 __LF \ + sbcs x13, x9, x3 __LF \ + adcs x14, x10, xzr __LF \ + sbcs x7, x11, x5 __LF \ + sbcs xzr, x2, xzr __LF 
\ + csel x8, x8, x12, cc __LF \ + csel x9, x9, x13, cc __LF \ + csel x10, x10, x14, cc __LF \ + csel x11, x11, x7, cc __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_sm2(P0,P1) \ + ldp x2, x3, [P1] __LF \ + mul x9, x2, x3 __LF \ + umulh x10, x2, x3 __LF \ + ldp x4, x5, [P1+16] __LF \ + mul x11, x2, x5 __LF \ + umulh x12, x2, x5 __LF \ + mul x6, x2, x4 __LF \ + umulh x7, x2, x4 __LF \ + adds x10, x10, x6 __LF \ + adcs x11, x11, x7 __LF \ + mul x6, x3, x4 __LF \ + umulh x7, x3, x4 __LF \ + adc x7, x7, xzr __LF \ + adds x11, x11, x6 __LF \ + mul x13, x4, x5 __LF \ + umulh x14, x4, x5 __LF \ + adcs x12, x12, x7 __LF \ + mul x6, x3, x5 __LF \ + umulh x7, x3, x5 __LF \ + adc x7, x7, xzr __LF \ + adds x12, x12, x6 __LF \ + adcs x13, x13, x7 __LF \ + adc x14, x14, xzr __LF \ + adds x9, x9, x9 __LF \ + adcs x10, x10, x10 __LF \ + adcs x11, x11, x11 __LF \ + adcs x12, x12, x12 __LF \ + adcs x13, x13, x13 __LF \ + adcs x14, x14, x14 __LF \ + cset x7, cs __LF \ + umulh x6, x2, x2 __LF \ + mul x8, x2, x2 __LF \ + adds x9, x9, x6 __LF \ + mul x6, x3, x3 __LF \ + adcs x10, x10, x6 __LF \ + umulh x6, x3, x3 __LF \ + adcs x11, x11, x6 __LF \ + mul x6, x4, x4 __LF \ + adcs x12, x12, x6 __LF \ + umulh x6, x4, x4 __LF \ + adcs x13, x13, x6 __LF \ + mul x6, x5, x5 __LF \ + adcs x14, x14, x6 __LF \ + umulh x6, x5, x5 __LF \ + adc x7, x7, x6 __LF \ + lsl x4, x8, #32 __LF \ + lsr x5, x8, #32 __LF \ + subs x2, x4, x8 __LF \ + sbc x3, x5, xzr __LF \ + subs x9, x9, x2 __LF \ + sbcs x10, x10, x3 __LF \ + sbcs x11, x11, x4 __LF \ + sbc x8, x8, x5 __LF \ + lsl x4, x9, #32 __LF \ + lsr x5, x9, #32 __LF \ + subs x2, x4, x9 __LF \ + sbc x3, x5, xzr __LF \ + subs x10, x10, x2 __LF \ + sbcs x11, x11, x3 __LF \ + sbcs x8, x8, x4 __LF \ + sbc x9, x9, x5 __LF \ + lsl x4, x10, #32 __LF \ + lsr x5, x10, #32 __LF \ + subs x2, x4, x10 __LF \ + sbc x3, x5, xzr __LF \ + subs x11, x11, x2 __LF \ + sbcs x8, x8, x3 __LF \ + sbcs x9, x9, x4 __LF \ + sbc x10, x10, x5 __LF \ + lsl x4, x11, #32 __LF \ + lsr x5, x11, #32 __LF \ + subs x2, x4, x11 __LF \ + sbc x3, x5, xzr __LF \ + subs x8, x8, x2 __LF \ + sbcs x9, x9, x3 __LF \ + sbcs x10, x10, x4 __LF \ + sbc x11, x11, x5 __LF \ + adds x8, x8, x12 __LF \ + adcs x9, x9, x13 __LF \ + adcs x10, x10, x14 __LF \ + adcs x11, x11, x7 __LF \ + csetm x2, cs __LF \ + subs x8, x8, x2 __LF \ + and x3, x2, #0xffffffff00000000 __LF \ + sbcs x9, x9, x3 __LF \ + and x5, x2, #0xfffffffeffffffff __LF \ + sbcs x10, x10, x2 __LF \ + sbc x11, x11, x5 __LF \ + stp x8, x9, [P0] __LF \ + stp x10, x11, [P0+16] + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + ldp x5, x6, [P1] __LF \ + ldp x4, x3, [P2] __LF \ + subs x5, x5, x4 __LF \ + sbcs x6, x6, x3 __LF \ + ldp x7, x8, [P1+16] __LF \ + ldp x4, x3, [P2+16] __LF \ + sbcs x7, x7, x4 __LF \ + sbcs x8, x8, x3 __LF \ + csetm x3, cc __LF \ + adds x5, x5, x3 __LF \ + and x4, x3, #0xffffffff00000000 __LF \ + adcs x6, x6, x4 __LF \ + adcs x7, x7, x3 __LF \ + and x4, x3, #0xfffffffeffffffff __LF \ + adc x8, x8, x4 __LF \ + stp x5, x6, [P0] __LF \ + stp x7, x8, [P0+16] + +S2N_BN_SYMBOL(sm2_montjmixadd_alt): + +// Make room on stack for temporary variables +// Move the input arguments to stable places + + sub sp, sp, NSPACE + + mov input_z, x0 + mov input_x, x1 + mov input_y, x2 + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + 
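
[Editorial note, not part of the diff.] Before the sequence of field operations below, here is a hedged Python model of the same Jacobian mixed-addition schedule, using the temporaries' names and plain modular arithmetic. It is checked against the affine chord formula, which is a pure algebraic identity, so no curve equation is needed; the Montgomery scaling and the z_1 = 0 multiplexing handled at the end of the routine are deliberately left out.

# Editorial sketch: plain-arithmetic model of the mixed-addition schedule below.
import random

p = 2**256 - 2**224 - 2**96 + 2**64 - 1   # p_sm2

def mixadd(X1, Y1, Z1, x2, y2):
    # Same order of operations as the macro sequence, with % p in place of the
    # Montgomery multiplications and modular subtractions.
    zp2 = Z1 * Z1 % p
    y2a = Z1 * y2 % p
    x2a = zp2 * x2 % p
    y2a = zp2 * y2a % p
    xd = (x2a - X1) % p
    yd = (y2a - Y1) % p
    zz = xd * xd % p
    ww = yd * yd % p
    zzx1 = zz * X1 % p
    zzx2 = zz * x2a % p
    resx = (ww - zzx1) % p
    t1 = (zzx2 - zzx1) % p
    resz = xd * Z1 % p
    resx = (resx - zzx2) % p
    t2 = (zzx1 - resx) % p
    t1 = t1 * Y1 % p
    t2 = yd * t2 % p
    resy = (t2 - t1) % p
    return resx, resy, resz

for _ in range(100):
    x1, y1, x2, y2 = (random.randrange(1, p) for _ in range(4))
    if x1 == x2:
        continue
    Z1 = random.randrange(1, p)
    # Feed in the Jacobian triple (x1*Z1^2, y1*Z1^3, Z1) and an affine (x2, y2).
    X3, Y3, Z3 = mixadd(x1 * Z1**2 % p, y1 * Z1**3 % p, Z1, x2, y2)
    lam = (y2 - y1) * pow(x2 - x1, -1, p) % p
    ax3 = (lam * lam - x1 - x2) % p
    ay3 = (lam * (x1 - ax3) - y1) % p
    iz = pow(Z3, -1, p)
    assert X3 * iz**2 % p == ax3 and Y3 * iz**3 % p == ay3

The z_1 = 0 multiplexing at the end of the routine is what restores correctness when p1 is the point at infinity; the algebraic schedule modelled here does not cover that case.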
amontsqr_sm2(zp2,z_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,zp2,x_2) + montmul_sm2(y2a,zp2,y2a) + + sub_sm2(xd,x2a,x_1) + sub_sm2(yd,y2a,y_1) + + amontsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x_1) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(resz,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y_1) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + ldp x0, x1, [z_1] + ldp x2, x3, [z_1+16] + orr x4, x0, x1 + orr x5, x2, x3 + orr x4, x4, x5 + cmp x4, xzr + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_sm2), +// hence giving 0 + p2 = p2 for the final result. + + ldp x0, x1, [resx] + ldp x12, x13, [x_2] + csel x0, x0, x12, ne + csel x1, x1, x13, ne + ldp x2, x3, [resx+16] + ldp x12, x13, [x_2+16] + csel x2, x2, x12, ne + csel x3, x3, x13, ne + + ldp x4, x5, [resy] + ldp x12, x13, [y_2] + csel x4, x4, x12, ne + csel x5, x5, x13, ne + ldp x6, x7, [resy+16] + ldp x12, x13, [y_2+16] + csel x6, x6, x12, ne + csel x7, x7, x13, ne + + ldp x8, x9, [resz] + mov x12, #0x0000000000000001 + mov x13, #0x00000000ffffffff + csel x8, x8, x12, ne + csel x9, x9, x13, ne + ldp x10, x11, [resz+16] + mov x13, #0x0000000100000000 + csel x10, x10, xzr, ne + csel x11, x11, x13, ne + + stp x0, x1, [x_3] + stp x2, x3, [x_3+16] + stp x4, x5, [y_3] + stp x6, x7, [y_3+16] + stp x8, x9, [z_3] + stp x10, x11, [z_3+16] + +// Restore stack and return + + add sp, sp, NSPACE + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul.S new file mode 100644 index 00000000000..b86545851de --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul.S @@ -0,0 +1,4498 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for GM/T 0003-2012 curve SM2 +// Input scalar[4], point[12]; output res[12] +// +// extern void sm2_montjscalarmul +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version sm2_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_sm2 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve SM2, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_sm2) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. 
+// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Safe copies of inputs (res lasts the whole code, point not so long) +// and additional values in variables, with some aliasing + +#define res x19 +#define sgn x20 +#define j x20 +#define point x21 + +// Intermediate variables on the stack. + +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +#define NSPACE #(31*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmp x14, #(1*I) __LF \ + ldp x12, x13, [x15] __LF \ + csel x0, x12, x0, eq __LF \ + csel x1, x13, x1, eq __LF \ + ldp x12, x13, [x15, #16] __LF \ + csel x2, x12, x2, eq __LF \ + csel x3, x13, x3, eq __LF \ + ldp x12, x13, [x15, #32] __LF \ + csel x4, x12, x4, eq __LF \ + csel x5, x13, x5, eq __LF \ + ldp x12, x13, [x15, #48] __LF \ + csel x6, x12, x6, eq __LF \ + csel x7, x13, x7, eq __LF \ + ldp x12, x13, [x15, #64] __LF \ + csel x8, x12, x8, eq __LF \ + csel x9, x13, x9, eq __LF \ + ldp x12, x13, [x15, #80] __LF \ + csel x10, x12, x10, eq __LF \ + csel x11, x13, x11, eq __LF \ + add x15, x15, #96 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(sm2_montjscalarmul): + + stp x19, x20, [sp, #-16]! + stp x21, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + mov res, x0 + mov point, x2 + +// Load the digits of group order n_sm2 = [x12;x13;x14;x15] + + movbig(x12, #0x53bb, #0xf409, #0x39d5, #0x4123) + movbig(x13, #0x7203, #0xdf6b, #0x21c6, #0x052b) + mov x14, #0xffffffffffffffff + mov x15, #0xfffffffeffffffff + +// First, reduce the input scalar mod n_sm2, i.e. conditionally subtract n_sm2 + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + + subs x6, x2, x12 + sbcs x7, x3, x13 + sbcs x8, x4, x14 + sbcs x9, x5, x15 + + csel x2, x2, x6, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + +// Now if the top bit of the reduced scalar is set, negate it mod n_sm2, +// i.e. do n |-> n_sm2 - n. Remember the sign as "sgn" so we can +// correspondingly negate the point below. + + subs x6, x12, x2 + sbcs x7, x13, x3 + sbcs x8, x14, x4 + sbc x9, x15, x5 + + tst x5, #0x8000000000000000 + csel x2, x2, x6, eq + csel x3, x3, x7, eq + csel x4, x4, x8, eq + csel x5, x5, x9, eq + cset sgn, ne + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + mov x6, 0x8888888888888888 + adds x2, x2, x6 + adcs x3, x3, x6 + bic x7, x6, #0xF000000000000000 + adcs x4, x4, x6 + adc x5, x5, x7 + + stp x2, x3, [scalarb] + stp x4, x5, [scalarb+16] + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. 
This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_sm2 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. + + ldp x0, x1, [point] + stp x0, x1, [tab] + ldp x2, x3, [point, #16] + stp x2, x3, [tab+16] + + ldp x4, x5, [point, #32] + ldp x6, x7, [point, #48] + + mov x0, #0xffffffffffffffff + subs x0, x0, x4 + mov x1, #0xffffffff00000000 + sbcs x1, x1, x5 + mov x2, #0xffffffffffffffff + sbcs x2, x2, x6 + mov x3, #0xfffffffeffffffff + sbc x3, x3, x7 + + orr x8, x4, x5 + orr x9, x6, x7 + orr x8, x8, x9 + cmp x8, xzr + ccmp sgn, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tab+32] + stp x6, x7, [tab+48] + + ldp x0, x1, [point, #64] + stp x0, x1, [tab+64] + ldp x2, x3, [point, #80] + stp x2, x3, [tab+80] + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + add x0, tab+96*1 + add x1, tab + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, tab+96*2 + add x1, tab+96*1 + add x2, tab + bl sm2_montjscalarmul_sm2_montjadd + + add x0, tab+96*3 + add x1, tab+96*1 + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, tab+96*4 + add x1, tab+96*3 + add x2, tab + bl sm2_montjscalarmul_sm2_montjadd + + add x0, tab+96*5 + add x1, tab+96*2 + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, tab+96*6 + add x1, tab+96*5 + add x2, tab + bl sm2_montjscalarmul_sm2_montjadd + + add x0, tab+96*7 + add x1, tab+96*3 + bl sm2_montjscalarmul_sm2_montjdouble + +// Initialize the accumulator as a table entry for top 4 bits (unrecoded) + + ldr x14, [scalarb+24] + lsr x14, x14, #60 + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + + mov j, #252 + +// Main loop over size-4 bitfields: double 4 times then add signed digit + +sm2_montjscalarmul_mainloop: + sub j, j, #4 + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_sm2_montjdouble + + lsr x2, j, #6 + ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly + lsr x14, x14, j + and x14, x14, #15 + + subs x14, x14, #8 + cset x16, lo // x16 = sign of digit (1 = negative) + cneg x14, x14, lo // x14 = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_sm2 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). 
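
[Editorial note, not part of the diff.] The careful handling described above, selecting the negated y coordinate only when it is nonzero, can be summarised in a hedged Python sketch; the helper name is illustrative only.

# Editorial sketch: negate y only when a negation is requested and y != 0, so
# the stored value stays strictly below p_sm2 (p_sm2 - 0 would be out of range).
p_sm2 = 2**256 - 2**224 - 2**96 + 2**64 - 1

def select_y(y, negate):
    # y is assumed already reduced, 0 <= y < p_sm2
    out = (p_sm2 - y) if (negate and y != 0) else y
    assert 0 <= out < p_sm2                         # never returns p_sm2 itself
    assert out == y or (out + y) % p_sm2 == 0       # out is y or its negation mod p_sm2
    return out

assert select_y(0, True) == 0   # the degenerate y = 0 case stays in range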
+ + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, #0xffffffffffffffff + subs x0, x0, x4 + mov x1, #0xffffffff00000000 + sbcs x1, x1, x5 + mov x2, #0xffffffffffffffff + sbcs x2, x2, x6 + mov x3, #0xfffffffeffffffff + sbc x3, x3, x7 + + orr x12, x4, x5 + orr x13, x6, x7 + orr x12, x12, x13 + cmp x12, xzr + ccmp x16, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + + add x0, acc + add x1, acc + add x2, tabent + bl sm2_montjscalarmul_sm2_montjadd + + cbnz j, sm2_montjscalarmul_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + ldp x0, x1, [acc] + stp x0, x1, [res] + ldp x0, x1, [acc+16] + stp x0, x1, [res, #16] + ldp x0, x1, [acc+32] + stp x0, x1, [res, #32] + ldp x0, x1, [acc+48] + stp x0, x1, [res, #48] + ldp x0, x1, [acc+64] + stp x0, x1, [res, #64] + ldp x0, x1, [acc+80] + stp x0, x1, [res, #80] + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +sm2_montjscalarmul_sm2_montjadd: + stp x19, x20, [sp, #-0x10]! + sub sp, sp, #0xe0 + mov x17, x0 + mov x19, x1 + mov x20, x2 + ldp x2, x3, [x19, #0x40] + ldp x4, x5, [x19, #0x50] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x0, x0, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x0, x0, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs 
x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp] + stp x10, x6, [sp, #0x10] + ldp x2, x3, [x20, #0x40] + ldp x4, x5, [x20, #0x50] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x0, x0, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x0, x0, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp, #0xa0] + stp x10, x6, [sp, #0xb0] + ldp x3, x4, [x20, #0x40] + ldp x5, x6, [x20, #0x50] + ldp x7, x8, [x19, #0x20] + ldp x9, x10, [x19, #0x30] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc 
x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0xc0] + stp x11, x12, [sp, #0xd0] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0xc0] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0xd0] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0xc0] + stp x3, x4, [sp, #0xd0] + ldp x3, x4, [x19, #0x40] + ldp x5, x6, [x19, #0x50] + ldp x7, x8, [x20, #0x20] + ldp x9, x10, [x20, #0x30] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, 
[sp, #0x20] + stp x11, x12, [sp, #0x30] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x20] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x30] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x20] + stp x3, x4, [sp, #0x30] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #0x10] + ldp x7, x8, [x20] + ldp x9, x10, [x20, #0x10] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x40] + stp x11, x12, [sp, #0x50] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs 
x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x40] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x50] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x40] + stp x3, x4, [sp, #0x50] + ldp x3, x4, [sp, #0xa0] + ldp x5, x6, [sp, #0xb0] + ldp x7, x8, [x19] + ldp x9, x10, [x19, #0x10] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x80] + stp x11, x12, [sp, #0x90] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 
+ eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x80] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x90] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x80] + stp x3, x4, [sp, #0x90] + ldp x3, x4, [sp] + ldp x5, x6, [sp, #0x10] + ldp x7, x8, [sp, #0x20] + ldp x9, x10, [sp, #0x30] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x20] + stp x11, x12, [sp, #0x30] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, 
x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x20] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x30] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x20] + stp x3, x4, [sp, #0x30] + ldp x3, x4, [sp, #0xa0] + ldp x5, x6, [sp, #0xb0] + ldp x7, x8, [sp, #0xc0] + ldp x9, x10, [sp, #0xd0] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0xc0] + stp x11, x12, [sp, #0xd0] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, 
x1, [sp, #0xc0] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0xd0] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0xc0] + stp x3, x4, [sp, #0xd0] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x80] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0xa0] + stp x7, x8, [sp, #0xb0] + ldp x5, x6, [sp, #0x20] + ldp x4, x3, [sp, #0xc0] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x30] + ldp x4, x3, [sp, #0xd0] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x20] + stp x7, x8, [sp, #0x30] + ldp x2, x3, [sp, #0xa0] + ldp x4, x5, [sp, #0xb0] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x0, x0, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x0, x0, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + 
adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp, #0x60] + stp x10, x6, [sp, #0x70] + ldp x2, x3, [sp, #0x20] + ldp x4, x5, [sp, #0x30] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x0, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x0, x0, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x0, x0, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x0, x0, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x0, x0, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x0 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, 
x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp] + stp x10, x6, [sp, #0x10] + ldp x3, x4, [sp, #0x60] + ldp x5, x6, [sp, #0x70] + ldp x7, x8, [sp, #0x80] + ldp x9, x10, [sp, #0x90] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x80] + stp x11, x12, [sp, #0x90] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x80] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x90] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + 
adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x80] + stp x3, x4, [sp, #0x90] + ldp x3, x4, [sp, #0x60] + ldp x5, x6, [sp, #0x70] + ldp x7, x8, [sp, #0x40] + ldp x9, x10, [sp, #0x50] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x40] + stp x11, x12, [sp, #0x50] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x40] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x50] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x40] + stp x3, x4, [sp, #0x50] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #0x80] + subs 
x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x10] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #0x10] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x80] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x60] + stp x7, x8, [sp, #0x70] + ldp x3, x4, [sp, #0xa0] + ldp x5, x6, [sp, #0xb0] + ldp x7, x8, [x19, #0x40] + ldp x9, x10, [x19, #0x50] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0xa0] + stp x11, x12, [sp, #0xb0] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0xa0] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0xb0] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc 
x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0xa0] + stp x3, x4, [sp, #0xb0] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #0x40] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x10] + ldp x4, x3, [sp, #0x50] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #0x10] + ldp x5, x6, [sp, #0x80] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x90] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x80] + stp x7, x8, [sp, #0x90] + ldp x3, x4, [sp, #0x60] + ldp x5, x6, [sp, #0x70] + ldp x7, x8, [sp, #0xc0] + ldp x9, x10, [sp, #0xd0] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x60] + stp x11, x12, [sp, #0x70] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x60] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x70] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, 
#0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x60] + stp x3, x4, [sp, #0x70] + ldp x3, x4, [sp, #0xa0] + ldp x5, x6, [sp, #0xb0] + ldp x7, x8, [x20, #0x40] + ldp x9, x10, [x20, #0x50] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0xa0] + stp x11, x12, [sp, #0xb0] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0xa0] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0xb0] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs 
x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0xa0] + stp x3, x4, [sp, #0xb0] + ldp x3, x4, [sp, #0x20] + ldp x5, x6, [sp, #0x30] + ldp x7, x8, [sp, #0x80] + ldp x9, x10, [sp, #0x90] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x80] + stp x11, x12, [sp, #0x90] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x0, x10, x9 + cneg x0, x0, lo + mul x16, x15, x0 + umulh x0, x15, x0 + cinv x1, x1, lo + eor x16, x16, x1 + eor x0, x0, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x80] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x90] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x0, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x0 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x0, x8, x7 + cneg x0, x0, lo + mul x16, x3, x0 + umulh x0, x3, x0 + cinv x4, x4, lo + eor x16, x16, x4 + eor x0, x0, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x0 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc 
x0, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x0 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x0, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x0 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x80] + stp x3, x4, [sp, #0x90] + ldp x5, x6, [sp, #0x80] + ldp x4, x3, [sp, #0x60] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x90] + ldp x4, x3, [sp, #0x70] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x80] + stp x7, x8, [sp, #0x90] + ldp x0, x1, [x19, #0x40] + ldp x2, x3, [x19, #0x50] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + ldp x4, x5, [x20, #0x40] + ldp x6, x7, [x20, #0x50] + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + cmp x13, x12 + ldp x8, x9, [sp, #0xa0] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [sp, #0xb0] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + ldp x12, x13, [x19] + ldp x0, x1, [sp] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x20] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + ldp x12, x13, [x19, #0x10] + ldp x2, x3, [sp, #0x10] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x20, #0x10] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + ldp x12, x13, [x19, #0x20] + ldp x4, x5, [sp, #0x80] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [x20, #0x20] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + ldp x12, x13, [x19, #0x30] + ldp x6, x7, [sp, #0x90] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [x20, #0x30] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + stp x0, x1, [x17] + stp x2, x3, [x17, #0x10] + stp x4, x5, [x17, #0x20] + stp x6, x7, [x17, #0x30] + stp x8, x9, [x17, #0x40] + stp x10, x11, [x17, #0x50] + add sp, sp, #0xe0 + ldp x19, x20, [sp], #0x10 + ret + +sm2_montjscalarmul_sm2_montjdouble: + sub sp, sp, #0xd0 + stp x19, x20, [sp, #0xc0] + mov x19, x0 + mov x20, x1 + ldp x2, x3, [x20, #0x40] + ldp x4, x5, [x20, #0x50] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x17, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x17, x17, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x17, x17, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x17, x17, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x17, x17, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 
+ cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x17 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp] + stp x10, x6, [sp, #0x10] + ldp x2, x3, [x20, #0x20] + ldp x4, x5, [x20, #0x30] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x17, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x17, x17, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x17, x17, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x17, x17, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x17, x17, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x17 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + 
umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp, #0x20] + stp x10, x6, [sp, #0x30] + ldp x5, x6, [x20] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x20, #0x10] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x60] + stp x7, x8, [sp, #0x70] + ldp x4, x5, [x20] + ldp x8, x9, [sp] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x20, #0x10] + ldp x10, x11, [sp, #0x10] + adcs x6, x6, x10 + adcs x7, x7, x11 + csetm x2, hs + subs x4, x4, x2 + and x3, x2, #0xffffffff00000000 + sbcs x5, x5, x3 + and x1, x2, #0xfffffffeffffffff + sbcs x6, x6, x2 + sbc x7, x7, x1 + stp x4, x5, [sp, #0x40] + stp x6, x7, [sp, #0x50] + ldp x3, x4, [sp, #0x40] + ldp x5, x6, [sp, #0x50] + ldp x7, x8, [sp, #0x60] + ldp x9, x10, [sp, #0x70] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x60] + stp x11, x12, [sp, #0x70] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x17, x10, x9 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x60] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x70] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x3, x17 + umulh x17, x3, x17 + cinv x4, x4, lo + eor x16, x16, x4 + eor x17, x17, x4 + cmn x4, #0x1 + adcs 
x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x60] + stp x3, x4, [sp, #0x70] + ldp x4, x5, [x20, #0x20] + ldp x8, x9, [x20, #0x40] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x20, #0x30] + ldp x10, x11, [x20, #0x50] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #-0x100000000 + sbcs x9, x5, x9 + adcs x10, x6, xzr + mov x11, #-0x100000001 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, lo + csel x5, x5, x9, lo + csel x6, x6, x10, lo + csel x7, x7, x11, lo + stp x4, x5, [sp, #0x40] + stp x6, x7, [sp, #0x50] + ldp x3, x4, [x20] + ldp x5, x6, [x20, #0x10] + ldp x7, x8, [sp, #0x20] + ldp x9, x10, [sp, #0x30] + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x80] + stp x11, x12, [sp, #0x90] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x17, x10, x9 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x80] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x90] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + 
umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x3, x17 + umulh x17, x3, x17 + cinv x4, x4, lo + eor x16, x16, x4 + eor x17, x17, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x80] + stp x3, x4, [sp, #0x90] + ldp x2, x3, [sp, #0x60] + ldp x4, x5, [sp, #0x70] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x17, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x17, x17, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x17, x17, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x17, x17, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x17, x17, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x17 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, 
x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp, #0xa0] + stp x10, x6, [sp, #0xb0] + ldp x2, x3, [sp, #0x40] + ldp x4, x5, [sp, #0x50] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x17, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x17, x17, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x17, x17, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x17, x17, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x17, x17, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x17 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp, #0x40] + stp x10, x6, [sp, #0x50] + mov x1, #0x9 + mov x2, #-0x1 + ldp x9, x10, [sp, #0xa0] + subs x9, x2, x9 + mov x3, #-0x100000000 + sbcs x10, x3, x10 + ldp x11, x12, [sp, #0xb0] + sbcs x11, x2, x11 + mov x4, #-0x100000001 + sbc x12, x4, x12 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc + ldp x9, x10, [sp, #0x80] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, 
x12, [sp, #0x90] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x7, x7, #0x1 + lsl x8, x7, #32 + sub x9, x8, x7 + adds x3, x3, x7 + adcs x4, x4, x9 + adcs x5, x5, xzr + adcs x6, x6, x8 + csetm x7, lo + adds x3, x3, x7 + and x9, x7, #0xffffffff00000000 + adcs x4, x4, x9 + adcs x5, x5, x7 + and x8, x7, #0xfffffffeffffffff + adc x6, x6, x8 + stp x3, x4, [sp, #0xa0] + stp x5, x6, [sp, #0xb0] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x40] + stp x7, x8, [sp, #0x50] + ldp x2, x3, [sp, #0x20] + ldp x4, x5, [sp, #0x30] + umull x15, w2, w2 + lsr x11, x2, #32 + umull x16, w11, w11 + umull x11, w2, w11 + adds x15, x15, x11, lsl #33 + lsr x11, x11, #31 + adc x16, x16, x11 + umull x17, w3, w3 + lsr x11, x3, #32 + umull x1, w11, w11 + umull x11, w3, w11 + mul x12, x2, x3 + umulh x13, x2, x3 + adds x17, x17, x11, lsl #33 + lsr x11, x11, #31 + adc x1, x1, x11 + adds x12, x12, x12 + adcs x13, x13, x13 + adc x1, x1, xzr + adds x16, x16, x12 + adcs x17, x17, x13 + adc x1, x1, xzr + lsl x12, x15, #32 + lsr x11, x15, #32 + subs x14, x12, x15 + sbc x13, x11, xzr + subs x16, x16, x14 + sbcs x17, x17, x13 + sbcs x1, x1, x12 + sbc x15, x15, x11 + lsl x12, x16, #32 + lsr x11, x16, #32 + subs x14, x12, x16 + sbc x13, x11, xzr + subs x17, x17, x14 + sbcs x1, x1, x13 + sbcs x15, x15, x12 + sbc x16, x16, x11 + mul x6, x2, x4 + mul x14, x3, x5 + umulh x8, x2, x4 + subs x10, x2, x3 + cneg x10, x10, lo + csetm x13, lo + subs x12, x5, x4 + cneg x12, x12, lo + mul x11, x10, x12 + umulh x12, x10, x12 + cinv x13, x13, lo + eor x11, x11, x13 + eor x12, x12, x13 + adds x7, x6, x8 + adc x8, x8, xzr + umulh x9, x3, x5 + adds x7, x7, x14 + adcs x8, x8, x9 + adc x9, x9, xzr + adds x8, x8, x14 + adc x9, x9, xzr + cmn x13, #0x1 + adcs x7, x7, x11 + adcs x8, x8, x12 + adc x9, x9, x13 + adds x6, x6, x6 + adcs x7, x7, x7 + adcs x8, x8, x8 + adcs x9, x9, x9 + adc x10, xzr, xzr + adds x6, x6, x17 + adcs x7, x7, x1 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, xzr + lsl x12, x6, #32 + lsr x11, x6, #32 + subs x14, x12, x6 + sbc x13, x11, xzr + subs x7, x7, x14 + sbcs x8, x8, x13 + sbcs x9, x9, x12 + sbc x14, x6, x11 + adds x10, x10, x14 + adc x6, xzr, xzr + lsl x12, x7, #32 + lsr x11, x7, #32 + subs x14, x12, x7 + sbc x13, x11, xzr + subs x8, x8, x14 + sbcs x9, x9, x13 + sbcs x10, x10, x12 + sbc x14, x7, x11 + adds x6, x6, x14 + adc x7, xzr, xzr + mul x11, x4, x4 + adds x8, x8, x11 + mul x12, x5, x5 + umulh x11, x4, x4 + adcs x9, x9, x11 + adcs x10, x10, x12 + umulh x12, x5, x5 + adcs x6, x6, x12 + adc x7, x7, xzr + mul x11, x4, x5 + umulh x12, x4, x5 + adds x11, x11, x11 + adcs x12, x12, x12 + adc x13, xzr, xzr + adds x9, x9, x11 + adcs x10, x10, x12 + adcs x6, x6, x13 + adcs x7, x7, xzr + mov x11, #-0x100000000 + adds x5, x8, #0x1 + sbcs x11, x9, x11 + mov x13, #-0x100000001 + adcs x12, x10, xzr + sbcs x13, x6, x13 + sbcs xzr, x7, xzr + csel x8, x5, x8, hs + csel x9, x11, x9, hs + csel x10, x12, x10, hs + csel x6, x13, x6, hs + stp x8, x9, [sp] + stp x10, x6, [sp, #0x10] + ldp x3, x4, [sp, #0xa0] + ldp x5, x6, [sp, #0xb0] + ldp x7, x8, [sp, #0x60] + ldp x9, x10, [sp, #0x70] 
+ mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x3, x4 + cneg x15, x15, lo + csetm x1, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + stp x13, x14, [sp, #0x60] + stp x11, x12, [sp, #0x70] + mul x11, x5, x9 + mul x13, x6, x10 + umulh x12, x5, x9 + adds x16, x11, x13 + umulh x14, x6, x10 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x15, x5, x6 + cneg x15, x15, lo + csetm x1, lo + subs x17, x10, x9 + cneg x17, x17, lo + mul x16, x15, x17 + umulh x17, x15, x17 + cinv x1, x1, lo + eor x16, x16, x1 + eor x17, x17, x1 + cmn x1, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x1 + subs x3, x5, x3 + sbcs x4, x6, x4 + ngc x5, xzr + cmn x5, #0x1 + eor x3, x3, x5 + adcs x3, x3, xzr + eor x4, x4, x5 + adcs x4, x4, xzr + subs x7, x7, x9 + sbcs x8, x8, x10 + ngc x9, xzr + cmn x9, #0x1 + eor x7, x7, x9 + adcs x7, x7, xzr + eor x8, x8, x9 + adcs x8, x8, xzr + eor x10, x5, x9 + ldp x15, x1, [sp, #0x60] + adds x15, x11, x15 + adcs x1, x12, x1 + ldp x5, x9, [sp, #0x70] + adcs x5, x13, x5 + adcs x9, x14, x9 + adc x2, xzr, xzr + mul x11, x3, x7 + mul x13, x4, x8 + umulh x12, x3, x7 + adds x16, x11, x13 + umulh x14, x4, x8 + adcs x17, x12, x14 + adcs x14, x14, xzr + adds x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, xzr + subs x3, x3, x4 + cneg x3, x3, lo + csetm x4, lo + subs x17, x8, x7 + cneg x17, x17, lo + mul x16, x3, x17 + umulh x17, x3, x17 + cinv x4, x4, lo + eor x16, x16, x4 + eor x17, x17, x4 + cmn x4, #0x1 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x4 + cmn x10, #0x1 + eor x11, x11, x10 + adcs x11, x11, x15 + eor x12, x12, x10 + adcs x12, x12, x1 + eor x13, x13, x10 + adcs x13, x13, x5 + eor x14, x14, x10 + adcs x14, x14, x9 + adcs x3, x2, x10 + adcs x4, x10, xzr + adc x10, x10, xzr + adds x13, x13, x15 + adcs x14, x14, x1 + adcs x3, x3, x5 + adcs x4, x4, x9 + adc x10, x10, x2 + lsl x16, x11, #32 + lsr x15, x11, #32 + subs x1, x16, x11 + sbc x17, x15, xzr + subs x12, x12, x1 + sbcs x13, x13, x17 + sbcs x14, x14, x16 + sbc x11, x11, x15 + lsl x16, x12, #32 + lsr x15, x12, #32 + subs x1, x16, x12 + sbc x17, x15, xzr + subs x13, x13, x1 + sbcs x14, x14, x17 + sbcs x11, x11, x16 + sbc x12, x12, x15 + adds x3, x3, x11 + adcs x4, x4, x12 + adc x10, x10, xzr + add x2, x10, #0x1 + lsl x15, x2, #32 + sub x16, x15, x2 + adds x13, x13, x2 + adcs x14, x14, x16 + adcs x3, x3, xzr + adcs x4, x4, x15 + csetm x7, lo + adds x13, x13, x7 + and x16, x7, #0xffffffff00000000 + adcs x14, x14, x16 + adcs x3, x3, x7 + and x15, x7, #0xfffffffeffffffff + adc x4, x4, x15 + stp x13, x14, [sp, #0x60] + stp x3, x4, [sp, #0x70] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x20] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x30] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, 
x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [x19, #0x40] + stp x7, x8, [x19, #0x50] + ldp x1, x2, [sp, #0x80] + lsl x0, x1, #2 + ldp x6, x7, [sp, #0xa0] + subs x0, x0, x6 + extr x1, x2, x1, #0x3e + sbcs x1, x1, x7 + ldp x3, x4, [sp, #0x90] + extr x2, x3, x2, #0x3e + ldp x6, x7, [sp, #0xb0] + sbcs x2, x2, x6 + extr x3, x4, x3, #0x3e + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x4, x4, #0x1 + lsl x5, x4, #32 + sub x6, x5, x4 + adds x0, x0, x4 + adcs x1, x1, x6 + adcs x2, x2, xzr + adcs x3, x3, x5 + csetm x4, lo + adds x0, x0, x4 + and x6, x4, #0xffffffff00000000 + adcs x1, x1, x6 + adcs x2, x2, x4 + and x5, x4, #0xfffffffeffffffff + adc x3, x3, x5 + stp x0, x1, [x19] + stp x2, x3, [x19, #0x10] + mov x1, #0x8 + mov x2, #-0x1 + ldp x9, x10, [sp] + subs x9, x2, x9 + mov x3, #-0x100000000 + sbcs x10, x3, x10 + ldp x11, x12, [sp, #0x10] + sbcs x11, x2, x11 + mov x4, #-0x100000001 + sbc x12, x4, x12 + lsl x3, x9, #3 + extr x4, x10, x9, #0x3d + extr x5, x11, x10, #0x3d + extr x6, x12, x11, #0x3d + lsr x7, x12, #61 + mov x1, #0x3 + ldp x9, x10, [sp, #0x60] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #0x70] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x7, x7, #0x1 + lsl x8, x7, #32 + sub x9, x8, x7 + adds x3, x3, x7 + adcs x4, x4, x9 + adcs x5, x5, xzr + adcs x6, x6, x8 + csetm x7, lo + adds x3, x3, x7 + and x9, x7, #0xffffffff00000000 + adcs x4, x4, x9 + adcs x5, x5, x7 + and x8, x7, #0xfffffffeffffffff + adc x6, x6, x8 + stp x3, x4, [x19, #0x20] + stp x5, x6, [x19, #0x30] + ldp x19, x20, [sp, #0xc0] + add sp, sp, #0xd0 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul_alt.S new file mode 100644 index 00000000000..01682044936 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/sm2/sm2_montjscalarmul_alt.S @@ -0,0 +1,3405 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for GM/T 0003-2012 curve SM2 +// Input scalar[4], point[12]; output res[12] +// +// extern void sm2_montjscalarmul_alt +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version sm2_scalarmul_alt. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_sm2 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve SM2, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_sm2) then the output is guaranteed to +// represent the point at infinity, i.e. 
to have its z coordinate zero. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjscalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Safe copies of inputs (res lasts the whole code, point not so long) +// and additional values in variables, with some aliasing + +#define res x19 +#define sgn x20 +#define j x20 +#define point x21 + +// Intermediate variables on the stack. + +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +#define NSPACE #(31*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmp x14, #(1*I) __LF \ + ldp x12, x13, [x15] __LF \ + csel x0, x12, x0, eq __LF \ + csel x1, x13, x1, eq __LF \ + ldp x12, x13, [x15, #16] __LF \ + csel x2, x12, x2, eq __LF \ + csel x3, x13, x3, eq __LF \ + ldp x12, x13, [x15, #32] __LF \ + csel x4, x12, x4, eq __LF \ + csel x5, x13, x5, eq __LF \ + ldp x12, x13, [x15, #48] __LF \ + csel x6, x12, x6, eq __LF \ + csel x7, x13, x7, eq __LF \ + ldp x12, x13, [x15, #64] __LF \ + csel x8, x12, x8, eq __LF \ + csel x9, x13, x9, eq __LF \ + ldp x12, x13, [x15, #80] __LF \ + csel x10, x12, x10, eq __LF \ + csel x11, x13, x11, eq __LF \ + add x15, x15, #96 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0 __LF \ + movk nn, n1, lsl #16 __LF \ + movk nn, n2, lsl #32 __LF \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(sm2_montjscalarmul_alt): + + stp x19, x20, [sp, #-16]! + stp x21, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + mov res, x0 + mov point, x2 + +// Load the digits of group order n_sm2 = [x12;x13;x14;x15] + + movbig(x12, #0x53bb, #0xf409, #0x39d5, #0x4123) + movbig(x13, #0x7203, #0xdf6b, #0x21c6, #0x052b) + mov x14, #0xffffffffffffffff + mov x15, #0xfffffffeffffffff + +// First, reduce the input scalar mod n_sm2, i.e. conditionally subtract n_sm2 + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + + subs x6, x2, x12 + sbcs x7, x3, x13 + sbcs x8, x4, x14 + sbcs x9, x5, x15 + + csel x2, x2, x6, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + +// Now if the top bit of the reduced scalar is set, negate it mod n_sm2, +// i.e. do n |-> n_sm2 - n. Remember the sign as "sgn" so we can +// correspondingly negate the point below. + + subs x6, x12, x2 + sbcs x7, x13, x3 + sbcs x8, x14, x4 + sbc x9, x15, x5 + + tst x5, #0x8000000000000000 + csel x2, x2, x6, eq + csel x3, x3, x7, eq + csel x4, x4, x8, eq + csel x5, x5, x9, eq + cset sgn, ne + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + mov x6, 0x8888888888888888 + adds x2, x2, x6 + adcs x3, x3, x6 + bic x7, x6, #0xF000000000000000 + adcs x4, x4, x6 + adc x5, x5, x7 + + stp x2, x3, [scalarb] + stp x4, x5, [scalarb+16] + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. 
This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_sm2 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. + + ldp x0, x1, [point] + stp x0, x1, [tab] + ldp x2, x3, [point, #16] + stp x2, x3, [tab+16] + + ldp x4, x5, [point, #32] + ldp x6, x7, [point, #48] + + mov x0, #0xffffffffffffffff + subs x0, x0, x4 + mov x1, #0xffffffff00000000 + sbcs x1, x1, x5 + mov x2, #0xffffffffffffffff + sbcs x2, x2, x6 + mov x3, #0xfffffffeffffffff + sbc x3, x3, x7 + + orr x8, x4, x5 + orr x9, x6, x7 + orr x8, x8, x9 + cmp x8, xzr + ccmp sgn, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tab+32] + stp x6, x7, [tab+48] + + ldp x0, x1, [point, #64] + stp x0, x1, [tab+64] + ldp x2, x3, [point, #80] + stp x2, x3, [tab+80] + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + add x0, tab+96*1 + add x1, tab + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, tab+96*2 + add x1, tab+96*1 + add x2, tab + bl sm2_montjscalarmul_alt_sm2_montjadd + + add x0, tab+96*3 + add x1, tab+96*1 + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, tab+96*4 + add x1, tab+96*3 + add x2, tab + bl sm2_montjscalarmul_alt_sm2_montjadd + + add x0, tab+96*5 + add x1, tab+96*2 + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, tab+96*6 + add x1, tab+96*5 + add x2, tab + bl sm2_montjscalarmul_alt_sm2_montjadd + + add x0, tab+96*7 + add x1, tab+96*3 + bl sm2_montjscalarmul_alt_sm2_montjdouble + +// Initialize the accumulator as a table entry for top 4 bits (unrecoded) + + ldr x14, [scalarb+24] + lsr x14, x14, #60 + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + + mov j, #252 + +// Main loop over size-4 bitfields: double 4 times then add signed digit + +sm2_montjscalarmul_alt_mainloop: + sub j, j, #4 + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_alt_sm2_montjdouble + + add x0, acc + add x1, acc + bl sm2_montjscalarmul_alt_sm2_montjdouble + + lsr x2, j, #6 + ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly + lsr x14, x14, j + and x14, x14, #15 + + subs x14, x14, #8 + cset x16, lo // x16 = sign of digit (1 = negative) + cneg x14, x14, lo // x14 = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_sm2 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). 
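To summarize the loop structure spelled out in the comments above: the reduced scalar (top bit already cleared) is biased by the constant 0x0888...888 so that each 4-bit window minus 8 becomes a signed digit in [-8, 7]; the table holds 1*P through 8*P, the digit's absolute value picks a row through the constant-time selectblock scans, and a negative digit negates the selected y coordinate mod p_sm2, with y = 0 left untouched so the point-at-infinity encoding survives. A short Python model of the recoding and the conditional negation; the constants are the ones loaded above, everything else is illustrative:

    P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1
    BIAS = int("8" * 63, 16)          # the 0x0888...888 recoding constant

    def recode(k):
        # k is the scalar after reduction mod n_sm2 and conditional
        # negation, so k < 2^255.
        v = k + BIAS
        top = v >> 252                # top window, used unrecoded
        digits = [((v >> (4 * i)) & 0xf) - 8 for i in range(63)]
        return top, digits            # each digit lies in [-8, 7]

    def negate_y(y):
        # Conditional negation of a selected y coordinate: y = 0 maps to 0,
        # keeping the degenerate (z = 0) encoding of the point at infinity.
        return (P_SM2 - y) % P_SM2

    k = 0x7203df6b21c6052b53bbf40939d54123      # any scalar below 2^255
    top, digits = recode(k)
    assert (top << 252) + sum(d << (4 * i) for i, d in enumerate(digits)) == k

A zero digit selects the all-zero triple, which the addition subroutine treats as the point at infinity, so the loop needs no special case for it.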
+ + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, #0xffffffffffffffff + subs x0, x0, x4 + mov x1, #0xffffffff00000000 + sbcs x1, x1, x5 + mov x2, #0xffffffffffffffff + sbcs x2, x2, x6 + mov x3, #0xfffffffeffffffff + sbc x3, x3, x7 + + orr x12, x4, x5 + orr x13, x6, x7 + orr x12, x12, x13 + cmp x12, xzr + ccmp x16, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + + add x0, acc + add x1, acc + add x2, tabent + bl sm2_montjscalarmul_alt_sm2_montjadd + + cbnz j, sm2_montjscalarmul_alt_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + ldp x0, x1, [acc] + stp x0, x1, [res] + ldp x0, x1, [acc+16] + stp x0, x1, [res, #16] + ldp x0, x1, [acc+32] + stp x0, x1, [res, #32] + ldp x0, x1, [acc+48] + stp x0, x1, [res, #48] + ldp x0, x1, [acc+64] + stp x0, x1, [res, #64] + ldp x0, x1, [acc+80] + stp x0, x1, [res, #80] + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +sm2_montjscalarmul_alt_sm2_montjadd: + sub sp, sp, #0xe0 + mov x15, x0 + mov x16, x1 + mov x17, x2 + ldp x2, x3, [x16, #0x40] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #0x50] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + csetm x2, hs + subs x8, x8, x2 + and x3, x2, #0xffffffff00000000 + sbcs x9, x9, x3 + and x5, x2, #0xfffffffeffffffff + sbcs x10, x10, x2 + sbc x11, x11, x5 + stp x8, x9, [sp] + stp x10, x11, [sp, #0x10] + ldp x2, x3, [x17, #0x40] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x17, #0x50] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, 
x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + csetm x2, hs + subs x8, x8, x2 + and x3, x2, #0xffffffff00000000 + sbcs x9, x9, x3 + and x5, x2, #0xfffffffeffffffff + sbcs x10, x10, x2 + sbc x11, x11, x5 + stp x8, x9, [sp, #0xa0] + stp x10, x11, [sp, #0xb0] + ldp x3, x4, [x17, #0x40] + ldp x7, x8, [x16, #0x20] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #0x30] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x17, #0x50] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo 
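All of the multiply-and-reduce blocks in these local subroutines share one shape: a 4x4 schoolbook product into eight limbs, four word-level Montgomery reduction steps (the lsl/lsr/subs/sbc runs), and a single conditional subtraction of p_sm2 (the adds #0x1 ... csel tail just above). The word steps work because p_sm2 is congruent to -1 modulo 2^64, so the quotient word is simply the current low limb. A hedged Python model of the arithmetic, not of the exact register schedule:

    P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1
    MASK64 = 2**64 - 1

    def montgomery_reduce(t):
        # Maps t to t * 2^-256 mod p_sm2, for t < 2^256 * p_sm2
        # (for example the product of two values below p_sm2).
        for _ in range(4):
            m = t & MASK64              # quotient word: -p_sm2^-1 mod 2^64 is 1
            t = (t + m * P_SM2) >> 64   # low limb cancels; drop one word
        if t >= P_SM2:                  # at most one correction remains
            t -= P_SM2
        return t

    def montmul(a, b):
        # Montgomery-domain product: with R = 2^256 and inputs a*R and b*R
        # mod p_sm2, this returns a*b*R mod p_sm2.
        return montgomery_reduce(a * b)

In the assembly the product m * p_sm2 is never formed with a multiplier; the special shape of p_sm2 lets each step be carried out with shifts, subtractions and carries instead.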
+ stp x12, x13, [sp, #0xc0] + stp x14, x0, [sp, #0xd0] + ldp x3, x4, [x16, #0x40] + ldp x7, x8, [x17, #0x20] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #0x30] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #0x50] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x20] + stp x14, x0, [sp, #0x30] + ldp x3, x4, [sp] + ldp x7, x8, [x17] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #0x10] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x10] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, 
x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x40] + stp x14, x0, [sp, #0x50] + ldp x3, x4, [sp, #0xa0] + ldp x7, x8, [x16] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #0x10] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0xb0] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x80] + stp x14, x0, [sp, #0x90] + ldp x3, x4, [sp] + ldp x7, x8, [sp, #0x20] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, 
x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x30] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x10] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x20] + stp x14, x0, [sp, #0x30] + ldp x3, x4, [sp, #0xa0] + ldp x7, x8, [sp, #0xc0] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0xd0] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0xb0] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, 
x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0xc0] + stp x14, x0, [sp, #0xd0] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x80] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0xa0] + stp x7, x8, [sp, #0xb0] + ldp x5, x6, [sp, #0x20] + ldp x4, x3, [sp, #0xc0] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x30] + ldp x4, x3, [sp, #0xd0] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x20] + stp x7, x8, [sp, #0x30] + ldp x2, x3, [sp, #0xa0] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #0xb0] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + csetm x2, hs + subs x8, x8, x2 + and x3, x2, #0xffffffff00000000 + sbcs x9, x9, x3 + and x5, x2, #0xfffffffeffffffff + sbcs x10, x10, x2 + sbc x11, x11, x5 + stp x8, x9, [sp, #0x60] + stp x10, x11, [sp, #0x70] + ldp x2, x3, [sp, #0x20] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, 
#0x30] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo + csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp] + stp x10, x11, [sp, #0x10] + ldp x3, x4, [sp, #0x60] + ldp x7, x8, [sp, #0x80] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x90] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x70] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 
+ lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x80] + stp x14, x0, [sp, #0x90] + ldp x3, x4, [sp, #0x60] + ldp x7, x8, [sp, #0x40] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x50] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x70] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x40] + stp x14, x0, [sp, #0x50] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #0x80] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x10] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #0x10] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x80] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x90] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, 
#0x60] + stp x7, x8, [sp, #0x70] + ldp x3, x4, [sp, #0xa0] + ldp x7, x8, [x16, #0x40] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #0x50] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0xb0] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0xa0] + stp x14, x0, [sp, #0xb0] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #0x40] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x10] + ldp x4, x3, [sp, #0x50] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #0x10] + ldp x5, x6, [sp, #0x80] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x90] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x80] + stp x7, x8, [sp, #0x90] + ldp x3, x4, [sp, #0x60] + ldp x7, x8, [sp, #0xc0] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0xd0] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x70] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul 
x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x60] + stp x14, x0, [sp, #0x70] + ldp x3, x4, [sp, #0xa0] + ldp x7, x8, [x17, #0x40] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #0x50] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0xb0] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, 
xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0xa0] + stp x14, x0, [sp, #0xb0] + ldp x3, x4, [sp, #0x20] + ldp x7, x8, [sp, #0x80] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x90] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x30] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x80] + stp x14, x0, [sp, #0x90] + ldp x5, x6, [sp, #0x80] + ldp x4, x3, [sp, #0x60] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x90] + ldp x4, x3, [sp, #0x70] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x80] + stp x7, x8, [sp, #0x90] + ldp x0, x1, [x16, #0x40] + ldp x2, x3, [x16, #0x50] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + ldp x4, x5, [x17, #0x40] + ldp x6, x7, [x17, #0x50] + orr x13, x4, x5 + orr x14, x6, x7 + 
orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + cmp x13, x12 + ldp x8, x9, [sp, #0xa0] + csel x8, x0, x8, lo + csel x9, x1, x9, lo + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [sp, #0xb0] + csel x10, x2, x10, lo + csel x11, x3, x11, lo + csel x10, x6, x10, hi + csel x11, x7, x11, hi + ldp x12, x13, [x16] + ldp x0, x1, [sp] + csel x0, x12, x0, lo + csel x1, x13, x1, lo + ldp x12, x13, [x17] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + ldp x12, x13, [x16, #0x10] + ldp x2, x3, [sp, #0x10] + csel x2, x12, x2, lo + csel x3, x13, x3, lo + ldp x12, x13, [x17, #0x10] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + ldp x12, x13, [x16, #0x20] + ldp x4, x5, [sp, #0x80] + csel x4, x12, x4, lo + csel x5, x13, x5, lo + ldp x12, x13, [x17, #0x20] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + ldp x12, x13, [x16, #0x30] + ldp x6, x7, [sp, #0x90] + csel x6, x12, x6, lo + csel x7, x13, x7, lo + ldp x12, x13, [x17, #0x30] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + stp x0, x1, [x15] + stp x2, x3, [x15, #0x10] + stp x4, x5, [x15, #0x20] + stp x6, x7, [x15, #0x30] + stp x8, x9, [x15, #0x40] + stp x10, x11, [x15, #0x50] + add sp, sp, #0xe0 + ret + +sm2_montjscalarmul_alt_sm2_montjdouble: + sub sp, sp, #0xc0 + mov x15, x0 + mov x16, x1 + ldp x2, x3, [x16, #0x40] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #0x50] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo + csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp] + stp x10, x11, [sp, #0x10] + ldp x2, x3, [x16, #0x20] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #0x30] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + 
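One more aside, on the selection block that ends the addition subroutine above (the orr / cset / csel run just before its ret): because a zero z coordinate encodes the point at infinity, the routine checks both input z values and, in constant time, returns the other operand when exactly one input is infinity, falling back to the freshly computed sum otherwise. A sketch of the same decision, with the constant-time csel chain replaced by ordinary branches for clarity:

    def select_result(p1, p2, computed_sum):
        # p1, p2 and computed_sum are (x, y, z) triples; z == 0 means the
        # point at infinity.  Mirrors the csel logic, which keeps the
        # computed value whenever both or neither z is zero.
        (_, _, z1), (_, _, z2) = p1, p2
        if z1 == 0 and z2 != 0:
            return p2
        if z2 == 0 and z1 != 0:
            return p1
        return computed_sum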
adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo + csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp, #0x20] + stp x10, x11, [sp, #0x30] + ldp x5, x6, [x16] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x16, #0x10] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x60] + stp x7, x8, [sp, #0x70] + ldp x4, x5, [x16] + ldp x8, x9, [sp] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x16, #0x10] + ldp x10, x11, [sp, #0x10] + adcs x6, x6, x10 + adcs x7, x7, x11 + csetm x2, hs + subs x4, x4, x2 + and x3, x2, #0xffffffff00000000 + sbcs x5, x5, x3 + and x1, x2, #0xfffffffeffffffff + sbcs x6, x6, x2 + sbc x7, x7, x1 + stp x4, x5, [sp, #0x40] + stp x6, x7, [sp, #0x50] + ldp x3, x4, [sp, #0x40] + ldp x7, x8, [sp, #0x60] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x70] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0x50] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + 
sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x60] + stp x14, x0, [sp, #0x70] + ldp x4, x5, [x16, #0x20] + ldp x8, x9, [x16, #0x40] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x16, #0x30] + ldp x10, x11, [x16, #0x50] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #-0x100000000 + sbcs x9, x5, x9 + adcs x10, x6, xzr + mov x11, #-0x100000001 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, lo + csel x5, x5, x9, lo + csel x6, x6, x10, lo + csel x7, x7, x11, lo + stp x4, x5, [sp, #0x40] + stp x6, x7, [sp, #0x50] + ldp x3, x4, [x16] + ldp x7, x8, [sp, #0x20] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x30] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #0x10] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, 
x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x80] + stp x14, x0, [sp, #0x90] + ldp x2, x3, [sp, #0x60] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #0x70] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo + csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp, #0xa0] + stp x10, x11, [sp, #0xb0] + ldp x2, x3, [sp, #0x40] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #0x50] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, 
hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo + csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp, #0x40] + stp x10, x11, [sp, #0x50] + mov x1, #0x9 + mov x2, #-0x1 + ldp x9, x10, [sp, #0xa0] + subs x9, x2, x9 + mov x3, #-0x100000000 + sbcs x10, x3, x10 + ldp x11, x12, [sp, #0xb0] + sbcs x11, x2, x11 + mov x4, #-0x100000001 + sbc x12, x4, x12 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc + ldp x9, x10, [sp, #0x80] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #0x90] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x7, x7, #0x1 + lsl x8, x7, #32 + sub x9, x8, x7 + adds x3, x3, x7 + adcs x4, x4, x9 + adcs x5, x5, xzr + adcs x6, x6, x8 + csetm x7, lo + adds x3, x3, x7 + and x9, x7, #0xffffffff00000000 + adcs x4, x4, x9 + adcs x5, x5, x7 + and x8, x7, #0xfffffffeffffffff + adc x6, x6, x8 + stp x3, x4, [sp, #0xa0] + stp x5, x6, [sp, #0xb0] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x10] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [sp, #0x40] + stp x7, x8, [sp, #0x50] + ldp x2, x3, [sp, #0x20] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #0x30] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, hs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + lsl x4, x8, #32 + lsr x5, x8, #32 + subs x2, x4, x8 + sbc x3, x5, xzr + subs x9, x9, x2 + sbcs x10, x10, x3 + sbcs x11, x11, x4 + sbc x8, x8, x5 + lsl x4, x9, #32 + lsr x5, x9, #32 + subs x2, x4, x9 + sbc x3, x5, xzr + subs x10, x10, x2 + sbcs x11, x11, x3 + sbcs x8, x8, x4 + sbc x9, x9, x5 + lsl x4, x10, #32 + lsr x5, x10, #32 + subs x2, x4, x10 + sbc x3, x5, xzr + subs x11, x11, x2 + sbcs x8, x8, x3 + sbcs x9, x9, x4 + sbc x10, x10, x5 + lsl x4, x11, #32 + lsr x5, x11, #32 + subs x2, x4, x11 + sbc x3, x5, xzr + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, x4 + sbc x11, x11, x5 + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, hs + mov x3, #-0x100000000 + mov x5, #-0x100000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + adcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, lo 
+ csel x9, x9, x13, lo + csel x10, x10, x14, lo + csel x11, x11, x7, lo + stp x8, x9, [sp] + stp x10, x11, [sp, #0x10] + ldp x3, x4, [sp, #0xa0] + ldp x7, x8, [sp, #0x60] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #0x70] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #0xb0] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + lsl x11, x12, #32 + lsr x6, x12, #32 + subs x8, x11, x12 + sbc x7, x6, xzr + subs x13, x13, x8 + sbcs x14, x14, x7 + sbcs x0, x0, x11 + sbc x12, x12, x6 + lsl x11, x13, #32 + lsr x6, x13, #32 + subs x8, x11, x13 + sbc x7, x6, xzr + subs x14, x14, x8 + sbcs x0, x0, x7 + sbcs x12, x12, x11 + sbc x13, x13, x6 + lsl x11, x14, #32 + lsr x6, x14, #32 + subs x8, x11, x14 + sbc x7, x6, xzr + subs x0, x0, x8 + sbcs x12, x12, x7 + sbcs x13, x13, x11 + sbc x14, x14, x6 + lsl x11, x0, #32 + lsr x6, x0, #32 + subs x8, x11, x0 + sbc x7, x6, xzr + subs x12, x12, x8 + sbcs x13, x13, x7 + sbcs x14, x14, x11 + sbc x0, x0, x6 + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, hs + mov x11, #-0x100000000 + mov x6, #-0x100000001 + adds x1, x12, #0x1 + sbcs x3, x13, x11 + adcs x4, x14, xzr + sbcs x5, x0, x6 + sbcs xzr, x8, xzr + csel x12, x12, x1, lo + csel x13, x13, x3, lo + csel x14, x14, x4, lo + csel x0, x0, x5, lo + stp x12, x13, [sp, #0x60] + stp x14, x0, [sp, #0x70] + ldp x5, x6, [sp, #0x40] + ldp x4, x3, [sp, #0x20] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #0x50] + ldp x4, x3, [sp, #0x30] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, lo + adds x5, x5, x3 + and x4, x3, #0xffffffff00000000 + adcs x6, x6, x4 + adcs x7, x7, x3 + and x4, x3, #0xfffffffeffffffff + adc x8, x8, x4 + stp x5, x6, [x15, #0x40] + stp x7, x8, [x15, #0x50] + ldp x1, x2, [sp, #0x80] + lsl x0, x1, #2 + ldp x6, x7, [sp, #0xa0] + subs x0, x0, x6 + extr x1, x2, x1, #0x3e + sbcs x1, x1, x7 + ldp x3, x4, [sp, #0x90] + extr x2, x3, x2, #0x3e + ldp x6, x7, [sp, #0xb0] + sbcs x2, x2, x6 + extr x3, x4, x3, #0x3e + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x4, x4, #0x1 + lsl x5, x4, #32 + sub x6, x5, x4 + adds x0, x0, x4 + adcs x1, x1, x6 + adcs x2, x2, xzr + adcs x3, x3, x5 + csetm x4, lo + adds x0, x0, x4 + and x6, x4, #0xffffffff00000000 + adcs x1, x1, x6 + adcs x2, x2, x4 + and x5, x4, #0xfffffffeffffffff + adc x3, x3, x5 + stp x0, x1, [x15] + stp x2, x3, [x15, #0x10] + mov x1, #0x8 + mov x2, #-0x1 + ldp x9, x10, [sp] + subs x9, x2, x9 + mov x3, #-0x100000000 + sbcs 
x10, x3, x10 + ldp x11, x12, [sp, #0x10] + sbcs x11, x2, x11 + mov x4, #-0x100000001 + sbc x12, x4, x12 + lsl x3, x9, #3 + extr x4, x10, x9, #0x3d + extr x5, x11, x10, #0x3d + extr x6, x12, x11, #0x3d + lsr x7, x12, #61 + mov x1, #0x3 + ldp x9, x10, [sp, #0x60] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #0x70] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x7, x7, #0x1 + lsl x8, x7, #32 + sub x9, x8, x7 + adds x3, x3, x7 + adcs x4, x4, x9 + adcs x5, x5, xzr + adcs x6, x6, x8 + csetm x7, lo + adds x3, x3, x7 + and x9, x7, #0xffffffff00000000 + adcs x4, x4, x9 + adcs x5, x5, x7 + and x8, x7, #0xfffffffeffffffff + adc x6, x6, x8 + stp x3, x4, [x15, #0x20] + stp x5, x6, [x15, #0x30] + add sp, sp, #0xc0 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/README.md b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/README.md new file mode 100644 index 00000000000..23697bc3b7c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/README.md @@ -0,0 +1,23 @@ +# Tutorials for s2n-bignum + +This directory includes examples for verifying Arm programs using s2n-bignum +and HOL Light. +To verify programs in x86, see `x86/tutorial`. + +### Unary reasoning + +1. `simple.ml`: Verifying a simple arithmetic property of a linear program. +2. `sequence.ml`: Verifying a program by splitting into smaller chunks. +3. `branch.ml`: Verifying a program that has a conditional branch. +4. `memory.ml`: Verifying a program that manipulates a memory. +5. `loop.ml`: Verifying a program that has a simple loop. +6. `bignum.ml`: Writing a specification of a program dealing with big numbers & proving it. +7. `rodata.ml`: Reading data from the read-only section. + +### Relational reasoning + +1. `rel_simp.ml`: Proving equivalence of two simple programs. +2. `rel_equivtac.ml`: Proving equivalence of two programs that have small differences. +3. `rel_reordertac.ml`: Proving equivalence of two programs where the second one has instructions reordered from that of the first one. +4. `rel_loop.ml`: Proving equivalence of two simple loops. +5. `rel_veceq.ml`: Proving equivalence of scalar vs. vectorized 128x128->256-bit squaring. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.S new file mode 100644 index 00000000000..0d72c06c338 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.S @@ -0,0 +1,11 @@ + ldp x2, x3, [x0] + ldp x4, x5, [x1] + cmp x2, x4 + bne bb_false + cmp x3, x5 + bne bb_false + mov x0, #1 + ret +bb_false: + mov x0, xzr + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.ml new file mode 100644 index 00000000000..a445b6d76ef --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/bignum.ml @@ -0,0 +1,159 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that shows how to describe big numbers in a specification. +******************************************************************************) + +needs "arm/proofs/base.ml";; + +(* Let's prove that the following program + + 0: a9400c02 ldp x2, x3, [x0] + 4: a9401424 ldp x4, x5, [x1] + 8: eb04005f cmp x2, x4 + c: 540000a1 b.ne 20 // b.any + 10: eb05007f cmp x3, x5 + 14: 54000061 b.ne 20 // b.any + 18: d2800020 mov x0, #0x1 + 1c: d65f03c0 ret + +0000000000000020 : + 20: aa1f03e0 mov x0, xzr + 24: d65f03c0 ret + + .. returns 1 to x0 if a pair of 16-byte integers at buffer x0 and x1 + are equal, 0 otherwise. + Since this example uses 128 bit integers, we will use 'bignum_from_memory' + which will state that reading a memory buffer of a specified word number will + return some large natural number. +*) +let bignum_mc = define_assert_from_elf "bignum_mc" "arm/tutorial/bignum.o" [ + 0xa9400c02; (* arm_LDP X2 X3 X0 (Immediate_Offset (iword (&0))) *) + 0xa9401424; (* arm_LDP X4 X5 X1 (Immediate_Offset (iword (&0))) *) + 0xeb04005f; (* arm_CMP X2 X4 *) + 0x540000a1; (* arm_BNE (word 20) *) + 0xeb05007f; (* arm_CMP X3 X5 *) + 0x54000061; (* arm_BNE (word 12) *) + 0xd2800020; (* arm_MOV X0 (rvalue (word 1)) *) + 0xd65f03c0; (* arm_RET X30 *) + 0xaa1f03e0; (* arm_MOV X0 XZR *) + 0xd65f03c0 (* arm_RET X30 *) +];; + +(* +You can get the above OCaml list data structure from +`print_literal_from_elf "<.o file>"` or +`save_literal_from_elf "" "<.o file>"`. +*) + +(* ARM_MK_EXEC_RULE decodes the byte sequence into conjunction of + equalities between the bytes and instructions. *) +let EXEC = ARM_MK_EXEC_RULE bignum_mc;; + +let BIGNUM_SPEC = prove( + `forall pc retpc loc0 loc1 a b. + ensures arm + // Precondition + (\s. aligned_bytes_loaded s (word pc) bignum_mc /\ + read PC s = word pc /\ + read X30 s = word retpc /\ + read X0 s = word loc0 /\ + read X1 s = word loc1 /\ + // Read 2 words (=128bits) at loc0. It is equivalent to num a. + // Alternatively, this kind of condition can be written using + // bignum_of_wordlist which takes a list of 64-bit words. + bignum_from_memory (word loc0,2) s = a /\ + // Read 2 words (=128bits) at loc1. It is equivalent to num b. + bignum_from_memory (word loc1,2) s = b + ) + // Postcondition + (\s. read PC s = word retpc /\ + read X0 s = word (if a = b then 1 else 0)) + // Registers (and memory locations) that may change after execution + (MAYCHANGE [PC;X0;X2;X3;X4;X5] ,, MAYCHANGE SOME_FLAGS ,, + MAYCHANGE [events])`, + + REPEAT STRIP_TAC THEN + (* Convert 'bignum_from_memory' into 'memory :> bytes (..)'. + Also, expand SOME_FLAGS *) + REWRITE_TAC[BIGNUM_FROM_MEMORY_BYTES;SOME_FLAGS] THEN + (* Start symbolic execution with state 's0' *) + ENSURES_INIT_TAC "s0" THEN + (* Split the memory :> bytes .. into a pair of memory :> bytes64. + This is necessary to successfully encode the symbolic result of ldps. *) + BIGNUM_DIGITIZE_TAC "a_" `read (memory :> bytes (word loc0,8 * 2)) s0` THEN + BIGNUM_DIGITIZE_TAC "b_" `read (memory :> bytes (word loc1,8 * 2)) s0` THEN + + (* Symbolically run two ldp instructions *) + ARM_STEPS_TAC EXEC (1--2) THEN + (* Until first 'bne' *) + ARM_STEPS_TAC EXEC (3--4) THEN + + (* Recognize the if condition and create two subgoals . *) + FIRST_X_ASSUM MP_TAC THEN + COND_CASES_TAC THENL [ + (* The low 64 bits of a and b are different. 
*) + STRIP_TAC THEN + ARM_STEPS_TAC EXEC (5--6) THEN + (* Returned; Finalize symbolic execution. *) + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + (* From `~(val (word_sub a_0 b_0) = 0)` and `val a_0 + 2 EXP 64 * val a_1 = a`, + and `val b_0 + 2 EXP 64 * val b_1 = b`, + prove `~(a = b)`. *) + SUBGOAL_THEN `~(a:num = b)` (fun th -> REWRITE_TAC[th]) THEN + MAP_EVERY EXPAND_TAC ["a";"b"] THEN + (* VAL_WORD_SUB_EQ_0: |- !x y. val (word_sub x y) = 0 <=> val x = val y) *) + RULE_ASSUM_TAC (REWRITE_RULE [VAL_WORD_SUB_EQ_0]) THEN + (* EQ_DIVMOD: |- !p m n. m DIV p = n DIV p /\ m MOD p = n MOD p <=> m = n *) + ONCE_REWRITE_TAC[SPEC `2 EXP 64` (GSYM EQ_DIVMOD)] THEN + (* The first '.. DIV .. = .. DIV ..' part is irelevant. *) + MATCH_MP_TAC (TAUT (`~Q ==> ~(P /\ Q)`)) THEN + (* Simplfy! *) + SIMP_TAC[MOD_MULT_ADD;VAL_BOUND_64;ARITH_RULE`~(2 EXP 64 = 0)`] THEN + ASM_SIMP_TAC[MOD_LT;VAL_BOUND_64]; + + ALL_TAC + ] THEN + + (* The low 64 bits of a and b are equivalent. *) + (* Until the second 'bne' *) + STRIP_TAC THEN + ARM_STEPS_TAC EXEC (5--6) THEN + + (* Recognize the if condition and create two subgoals . *) + FIRST_X_ASSUM MP_TAC THEN + COND_CASES_TAC THENL [ + (* The high 64 bits of a and b are different. *) + STRIP_TAC THEN + ARM_STEPS_TAC EXEC (7--8) THEN + (* Returned; Finalize symbolic execution. *) + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + (* Proof pattern is similar to the first branch case *) + SUBGOAL_THEN `~(a:num = b)` (fun th -> REWRITE_TAC[th]) THEN + MAP_EVERY EXPAND_TAC ["a";"b"] THEN + (* VAL_WORD_SUB_EQ_0: |- !x y. val (word_sub x y) = 0 <=> val x = val y) *) + RULE_ASSUM_TAC (REWRITE_RULE [VAL_WORD_SUB_EQ_0]) THEN + (* EQ_DIVMOD: |- !p m n. m DIV p = n DIV p /\ m MOD p = n MOD p <=> m = n *) + ONCE_REWRITE_TAC[SPEC `2 EXP 64` (GSYM EQ_DIVMOD)] THEN + (* The second '.. MOD .. = .. MOD ..' part is irelevant. *) + MATCH_MP_TAC (TAUT (`~P ==> ~(P /\ Q)`)) THEN + (* Simplfy! *) + SIMP_TAC[DIV_MULT_ADD;VAL_BOUND_64;ARITH_RULE`~(2 EXP 64 = 0)`] THEN + ASM_SIMP_TAC[DIV_LT;VAL_BOUND_64;ADD_CLAUSES]; + + ALL_TAC + ] THEN + + (* Both limbs are equivalent! *) + STRIP_TAC THEN + ARM_STEPS_TAC EXEC (7--8) THEN + (* Try to prove the postcondition and frame as much as possible *) + ENSURES_FINAL_STATE_TAC THEN + (* Use ASM_REWRITE_TAC[] to rewrite the goal using equalities in assumptions. *) + ASM_REWRITE_TAC[] THEN + SUBGOAL_THEN `(a:num = b)` (fun th -> REWRITE_TAC[th]) THEN + RULE_ASSUM_TAC (REWRITE_RULE [VAL_WORD_SUB_EQ_0]) THEN + ASM_ARITH_TAC);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.S new file mode 100644 index 00000000000..52f8d1f0ade --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.S @@ -0,0 +1,7 @@ + cmp x1, x2 + b.hi BB2 + mov x0, x2 + ret +BB2: + mov x0, x1 + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.ml new file mode 100644 index 00000000000..284cdfcc3f0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/branch.ml @@ -0,0 +1,119 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Prove a property of a simple program that has a conditional branch. 
+******************************************************************************) + +needs "arm/proofs/base.ml";; + +(* The following program + 0: eb02003f cmp x1, x2 + 4: 54000068 b.hi 10 + 8: aa0203e0 mov x0, x2 + c: d65f03c0 ret + +0000000000000010 : + 10: aa0103e0 mov x0, x1 + 14: d65f03c0 ret + + .. copies max(x1,x2) to x0 and returns to the caller. + Let's prove this property. +*) + +let branch_mc = new_definition `branch_mc = [ + word 0x3f; word 0x00; word 0x02; word 0xeb; // cmp x1, x2 + word 0x68; word 0x00; word 0x00; word 0x54; // b.hi 10 + + word 0xe0; word 0x03; word 0x02; word 0xaa; // mov x0, x2 + word 0xc0; word 0x03; word 0x5f; word 0xd6; // ret + + // BB2: + word 0xe0; word 0x03; word 0x01; word 0xaa; // mov x0, x1 + word 0xc0; word 0x03; word 0x5f; word 0xd6 // ret + ]:((8)word)list`;; + +let EXEC = ARM_MK_EXEC_RULE branch_mc;; + +let branch_SPEC = prove( + `forall pc pcret a b. + ensures arm + // Precondition + (\s. aligned_bytes_loaded s (word pc) branch_mc /\ + read X30 s = word pcret /\ + read PC s = word pc /\ + read X1 s = word a /\ + read X2 s = word b) + // Postcondition + (\s. read PC s = word pcret /\ + read X0 s = word_umax (word a) (word b)) + // Registers (and memory locations) that may change after execution. + // ',,' is composition of relations. + (MAYCHANGE [PC;X0] ,, MAYCHANGE SOME_FLAGS ,, + // Branch instructions raise observable microarchitectural events! + MAYCHANGE [events])`, + (* Strips the outermost universal quantifier from the conclusion of a goal *) + REPEAT STRIP_TAC THEN + (* ENSURES_FINAL_STATE_TAC does not understand SOME_FLAGS in MAYCHANGE. Let's + unfold this in advance. *) + REWRITE_TAC [SOME_FLAGS] THEN + + (* Let's do symbolic execution until it hits the branch instruction. *) + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC EXEC (1--2) THEN + + (* The PC has the following symbolic expression: + `read PC s2 = + (if val (word b) <= val (word a) /\ + ~(val (word_sub (word a) (word b)) = 0) + then word (pc + 16) + else word (pc + 8))` + Let's do case analysis on the condition of this if expression. + + First, move this assumption to the antecendent of the goal so the goal + becomes: + (read PC s2 = ...) ==> eventually arm ... + *) + FIRST_X_ASSUM MP_TAC THEN + + (* Recognize the if condition and create two subgoals . *) + COND_CASES_TAC THENL [ + (** Case 1: if the branch was taken! **) + (* Let's name the hypothesis first. *) + POP_ASSUM (LABEL_TAC "Hcond") THEN + DISCH_TAC THEN + + (* Do symbolic execution on the remaining two insts. *) + ARM_STEPS_TAC EXEC (3--4) THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + + (* The remaining goal is `word a = word (MAX a b).` *) + REMOVE_THEN "Hcond" MP_TAC THEN + (* WORD_UMAX: `!x y. word_umax x y = (if val x <= val y then y else x)` + VAL_WORD_SUB_EQ_0: `!x y. val (word_sub x y) = 0 <=> val x = val y` *) + REWRITE_TAC[WORD_UMAX;VAL_WORD_SUB_EQ_0] THEN + (* Let ARITH_TAC deal with reasoning on relational equations. *) + ARITH_TAC; + + + (** Case 2: if the branch was not taken! **) + (* Let's name the hypothesis first. *) + POP_ASSUM (LABEL_TAC "Hcond") THEN + DISCH_TAC THEN + + (* Do symbolic execution on the remaining two insts. *) + ARM_STEPS_TAC EXEC (3--4) THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + + (* The remaining goal is `word b = word (MAX a b).` *) + REMOVE_THEN "Hcond" MP_TAC THEN + (* WORD_UMAX: `!x y. word_umax x y = (if val x <= val y then y else x)` + VAL_WORD_SUB_EQ_0: `!x y. 
val (word_sub x y) = 0 <=> val x = val y` *) + REWRITE_TAC[WORD_UMAX;VAL_WORD_SUB_EQ_0] THEN + (* Let ARITH_TAC deal with reasoning on relational equations. *) + ARITH_TAC; + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.S new file mode 100644 index 00000000000..5872b164679 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.S @@ -0,0 +1,10 @@ + mov x1, xzr + mov x0, xzr + +loop: + add x1, x1, #1 + add x0, x0, #2 + cmp x1, #10 + bne loop + + ret diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.ml new file mode 100644 index 00000000000..0fb56c44678 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/loop.ml @@ -0,0 +1,123 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Prove a property of a simple program that has a loop. +******************************************************************************) + +needs "arm/proofs/base.ml";; + +(* The following program + 0: aa1f03e1 mov x1, xzr + 4: aa1f03e0 mov x0, xzr + +0000000000000008 : + 8: 91000421 add x1, x1, #0x1 + c: 91000800 add x0, x0, #0x2 + 10: f100283f cmp x1, #0xa + 14: 54ffffa1 b.ne 8 // b.any + 18: d65f03c0 ret + + increments x0 until its value is 20. + Let's prove that this function returns 20. +*) +let loop_mc = new_definition `loop_mc = [ + word 0xe1; word 0x03; word 0x1f; word 0xaa; // mov x1, xzr + word 0xe0; word 0x03; word 0x1f; word 0xaa; // mov x0, xzr + +// loop: + word 0x21; word 0x04; word 0x00; word 0x91; // add x1, x1, #0x1 + word 0x00; word 0x08; word 0x00; word 0x91; // add x0, x0, #0x2 + word 0x3f; word 0x28; word 0x00; word 0xf1; // cmp x1, #0xa + word 0xa1; word 0xff; word 0xff; word 0x54; // b.ne 8 + word 0xc0; word 0x03; word 0x5f; word 0xd6 // ret +]:((8)word)list`;; + +let EXEC = ARM_MK_EXEC_RULE loop_mc;; + +let loop_SPEC = prove( + `forall pc retpc. + ensures arm + // Precondition + (\s. aligned_bytes_loaded s (word pc) loop_mc /\ + read PC s = word pc /\ + read X30 s = word retpc) + // Postcondition + (\s. read PC s = word retpc /\ + read X0 s = word 20) + // Registers (and memory locations) that may change after execution + (MAYCHANGE [PC;X0;X1] ,, MAYCHANGE SOME_FLAGS ,, + // Branch instructions raise observable microarchitectural events! + MAYCHANGE [events])`, + (* Unravel ARM flag registers! *) + REWRITE_TAC[SOME_FLAGS] THEN + REPEAT STRIP_TAC THEN + + (* ENSURES_WHILE_PAUP_TAC is one of several tactics for declaring a hoare triple of a loop. + PAUP means: + - "P": The loop ends with a flag-setting instruction such as 'cmp' or 'adds'. + 'read ZF s <=> i = 10' in the below statement relates the flag with + the loop counter. + - "A": The loop counter starts from variable 'a', In this tactic, this is 0. + Actually, when a = 0, you can also use ENSURES_WHILE_PUP_TAC. + - "UP": The counter goes up. *) + ENSURES_WHILE_PAUP_TAC + `0` (* counter begin number *) + `10` (* counter end number *) + `pc + 8` (* loop body start PC *) + `pc + 0x14` (* loop backedge branch PC *) + `\i s. 
// loop invariant at the end of the loop + (read X1 s = word i /\ read X0 s = word (i*2) /\ read X30 s = word retpc) /\ + // loop backedge condition + (read ZF s <=> i = 10)` THEN + REPEAT CONJ_TAC THENL [ + (* counter begin < counter end *) + ARITH_TAC; + + (* entrance to the loop *) + (* Let's use ARM_SIM_TAC which is ENSURES_INIT_TAC + ARM_STEPS_TAC + + ENSURES_FINAL_STATE_TAC + some post-processing. *) + ARM_SIM_TAC EXEC (1--2) THEN + CONV_TAC WORD_RULE; + + (* The loop body. let's prove this later. *) + (* If you are interactively exploring this proof, try `r 1;;`. *) + ALL_TAC; + + (* Prove that backedge is taken if i != 10. *) + REPEAT STRIP_TAC THEN + ARM_SIM_TAC EXEC [1]; + + (* Loop exit to the end of the program *) + ARM_SIM_TAC EXEC (1--2) THEN + (* word (10*2) = word 20 *) + CONV_TAC WORD_RULE + ] THEN + + (* The loop body *) + REPEAT STRIP_TAC THEN + ARM_SIM_TAC EXEC (1--3) THEN + REPEAT CONJ_TAC THENL [ + (* `word_add (word i) (word 1) = word (i + 1)` *) + CONV_TAC WORD_RULE; + + (* `word_add (word (i * 2)) (word 2) = word ((i + 1) * 2)` *) + CONV_TAC WORD_RULE; + + (* `val (word_add (word i) (word 18446744073709551607)) = 0 <=> i + 1 = 10` *) + (* This goal is slightly complicated to prove using automatic solvers. + Let's manually attack this. *) + (* Yes, we also have 'WORD_BLAST' that works like bit-blasting. *) + REWRITE_TAC [WORD_BLAST `word_add x (word 18446744073709551607):int64 = + word_sub x (word 9)`] THEN + REWRITE_TAC[VAL_WORD_SUB_EQ_0] THEN + REWRITE_TAC[VAL_WORD;DIMINDEX_64] THEN + (* Rewrite all '_ MOD 2 EXP 64' to '_' because they are known to be less + than 2 EXP 64. *) + IMP_REWRITE_TAC[MOD_LT; ARITH_RULE`9 < 2 EXP 64`] THEN + CONJ_TAC THEN (* will create two arithmetic subgoals. *) + (* both goals can be solved using ASM_ARITH_TAC. *) + ASM_ARITH_TAC + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.S new file mode 100644 index 00000000000..dd340152473 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.S @@ -0,0 +1,4 @@ +ldr x2, [x0] +ldr x3, [x1] +str x2, [x1] +str x3, [x0] diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.ml new file mode 100644 index 00000000000..3e3f3275295 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/memory.ml @@ -0,0 +1,76 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Prove a property of a simple program that reads from and writes to + the memory. +******************************************************************************) + +needs "arm/proofs/base.ml";; + +(* The following program + 0: f9400002 ldr x2, [x0] + 4: f9400023 ldr x3, [x1] + 8: f9000022 str x2, [x1] + c: f9000003 str x3, [x0] + + .. swaps the two words at address x0 and x1, of x0 and x1 do not alias. + Let's prove this. +*) + +let memory_mc = new_definition `memory_mc = [ + word 0x02; word 0x00; word 0x40; word 0xf9; // ldr x2, [x0] + word 0x23; word 0x00; word 0x40; word 0xf9; // ldr x3, [x1] + word 0x22; word 0x00; word 0x00; word 0xf9; // str x2, [x1] + word 0x03; word 0x00; word 0x00; word 0xf9 // str x3, [x0] +]:((8)word)list`;; + +let EXEC = ARM_MK_EXEC_RULE memory_mc;; + +let memory_SPEC = prove( + `forall pc loc0 loc1 a b. 
+ // Assume that loc0 (=x0) and loc1(=x1) do not overlap within 8 bytes. + nonoverlapping (word loc0:int64, 8) (word loc1:int64, 8) /\ + // .. and the writing locations do not overlap with the loaded program. + nonoverlapping (word loc0:int64, 8) (word pc:int64, LENGTH memory_mc) /\ + nonoverlapping (word loc1:int64, 8) (word pc:int64, LENGTH memory_mc) + ==> ensures arm + // Precondition + (\s. aligned_bytes_loaded s (word pc) memory_mc /\ + read PC s = word pc /\ + read X0 s = word loc0 /\ + read X1 s = word loc1 /\ + read (memory :> bytes64 (word loc0)) s = word a /\ + read (memory :> bytes64 (word loc1)) s = word b) + // Postcondition + (\s. read PC s = word (pc + 16) /\ + read (memory :> bytes64 (word loc0)) s = word b /\ + read (memory :> bytes64 (word loc1)) s = word a) + // Registers (and memory locations) that may change after execution. + // ',,' is composition of relations. + (MAYCHANGE [PC;X2;X3] ,, + // The memory locations may change. Record this. + MAYCHANGE [memory :> bytes64 (word loc0); memory :> bytes64 (word loc1)] ,, + // Memory instructions raise observable microarchitectural events! + MAYCHANGE [events])`, + + (* Convert 'nonoverlapping' into 'nonoverlapping_modulo' and rewrite 'LENGTH memory_mc' + with the concrete number. *) + REWRITE_TAC[NONOVERLAPPING_CLAUSES;fst EXEC] THEN + (* Strips the assumption and outermost universal quantifier from the conclusion of a goal *) + REPEAT STRIP_TAC THEN + + (* Let's do symbolic execution until it hits the branch instruction. *) + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC EXEC (1--4) THEN + + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[]);; + +(* If the written nonoverlapping condition is not sufficient, existing assumptions + on memory loads may be erased after simulating store instructions. + To print which instructions are erased, set + components_print_log := true;; +*) diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.S new file mode 100644 index 00000000000..d1bbc0c2766 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.S @@ -0,0 +1,4 @@ +ldp x11, x10, [x0] +add x12, x10, #1 +mul x12, x11, x12 // x11 * (x10 + 1) +str x12, [x1] diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.ml new file mode 100644 index 00000000000..ef48c23f5c3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac.ml @@ -0,0 +1,198 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that proves equivalence of two straight-line codes + accessing memory using EQUIV_STEPS_TAC. +******************************************************************************) + +(* Please copy this file to the root directory of + s2n-bignum, then follow the instructions. *) + +needs "arm/proofs/equiv.ml";; + +(* This example will define & prove the equivalence of two programs + using EQUIV_STEPS_TAC. + This tactic is useful if two programs are supposed to have many + equivalent parts. EQUIV_STEPS_TAC receives 'actions', which is an + OCaml list stating which lines are equivalent and which lines are diverging. + This 'actions' can be generated from, say, syntactic diff of + two assembly programs. 
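+   For example, the EQUIV proof later in this file passes the following
+   'actions' for the two 4-instruction programs mc and mc2; it states that
+   instructions 0 and 3 of both programs must yield symbolically equal
+   outputs, while instructions 1-2 are the diverging (replaced) part:
+
+     [("equal",0,1,0,1); ("replace",1,3,1,3); ("equal",3,4,3,4)]
+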
s2n-bignum also has tools/gen-actions.py + which runs the `diff` linux tool on two assembly files. *) + +let mc = define_assert_from_elf "mc" "arm/tutorial/rel_equivtac.o" [ + 0xa940280b; (* arm_LDP X11 X10 X0 (Immediate_Offset (iword (&0))) *) + 0x9100054c; (* arm_ADD X12 X10 (rvalue (word 1)) *) + 0x9b0c7d6c; (* arm_MUL X12 X11 X12 *) + 0xf900002c (* arm_STR X12 X1 (Immediate_Offset (word 0)) *) +];; + +(* Note that the used registers are different between mc and mc2 + (X10,X11,X12 vs. X20,X21,X22). This is fine since EQUIV_STEPS_TAC + can smartly map differently used registers. *) +let mc2 = define_assert_from_elf "mc2" "arm/tutorial/rel_equivtac2.o" [ + 0xa9405015; (* arm_LDP X21 X20 X0 (Immediate_Offset (iword (&0))) *) + 0x9b147eb6; (* arm_MUL X22 X21 X20 *) + 0x8b1502d6; (* arm_ADD X22 X22 X21 *) + 0xf9000036 (* arm_STR X22 X1 (Immediate_Offset (word 0)) *) +];; + +let EXEC = ARM_MK_EXEC_RULE mc;; +let EXEC2 = ARM_MK_EXEC_RULE mc2;; + +(* Define the equality between the input states. *) +let eqin = new_definition + `forall s1 s1' inbuf outbuf. + (eqin:(armstate#armstate)->int64->int64->bool) (s1,s1') inbuf outbuf <=> + (// The values of buffer pointers, X0 and X1. + // Their values are symbolically defined as inbuf and outbuf. + // outbuf is also used for the nonoverlapping precondition between + // the output buffer and the program bytecode. + read X0 s1 = inbuf /\ + read X0 s1' = inbuf /\ + read X1 s1 = outbuf /\ + read X1 s1' = outbuf /\ + // The equal buffer contents at the input buffer. '2' stands for 2 words + // (and 1 word is 8 bytes, hence 2*8=16 bytes) + (exists n. + bignum_from_memory (inbuf,2) s1 = n /\ + bignum_from_memory (inbuf,2) s1' = n))`;; + +(* Define the equality between the output states. *) +let eqout = new_definition + `forall s1 s1' outbuf. + (eqout:(armstate#armstate)->int64->bool) (s1,s1') outbuf <=> + (read X1 s1 = outbuf /\ + read X1 s1' = outbuf /\ + (exists n. + bignum_from_memory (outbuf,1) s1 = n /\ + bignum_from_memory (outbuf,1) s1' = n))`;; + +(* Now, build the program equivalence statement using + 'mk_equiv_statement_simple'. + Its first argument states the assumption that will appear at + LHS of ' ==> ensures2 ..(equiv statement)..'. + + If it fails, please try `arm_print_log := true`. *) +let equiv_goal = mk_equiv_statement_simple + `ALL (nonoverlapping (outbuf,8)) [ + (word pc:int64, LENGTH mc); + (word pc2:int64, LENGTH mc2) + ]` + eqin (* Input state equivalence *) + eqout (* Output state equivalence *) + mc (* First program machine code *) + `MAYCHANGE [PC; X10; X11; X12] ,, MAYCHANGE [memory :> bytes (outbuf, 8)] ,, + MAYCHANGE [events]` + mc2 (* Second program machine code *) + `MAYCHANGE [PC; X20; X21; X22] ,, MAYCHANGE [memory :> bytes (outbuf, 8)] ,, + MAYCHANGE [events]`;; + +(* equiv_goal is: + `forall pc pc2 inbuf outbuf. + ALL (nonoverlapping (outbuf,8)) + [word pc,LENGTH mc; word pc2,LENGTH mc2] + ==> ensures2 arm + (\(s,s2). + aligned_bytes_loaded s (word pc) mc /\ + read PC s = word pc /\ + aligned_bytes_loaded s2 (word pc2) mc2 /\ + read PC s2 = word pc2 /\ + eqin (s,s2) inbuf outbuf) + (\(s,s2). + aligned_bytes_loaded s (word pc) mc /\ + read PC s = word (pc + 16) /\ + aligned_bytes_loaded s2 (word pc2) mc2 /\ + read PC s2 = word (pc2 + 16) /\ + eqout (s,s2) outbuf) + (\(s,s2) (s',s2'). + (MAYCHANGE [PC; X10; X11; X12] ,, + MAYCHANGE [memory :> bytes (outbuf,8)] ,, + MAYCHANGE [events]) + s + s' /\ + (MAYCHANGE [PC; X20; X21; X22] ,, + MAYCHANGE [memory :> bytes (outbuf,8)] ,, + MAYCHANGE [events]) + s2 + s2') + (\s. 4) + (\s. 
4)` +*) + +(* Now, let's prove the program equivalence. *) +let EQUIV = prove(equiv_goal, + + (* Rewrite ALL, nonoverlapping, and LENGTH * *) + REWRITE_TAC[ALL;NONOVERLAPPING_CLAUSES; fst EXEC; fst EXEC2] THEN + REPEAT STRIP_TAC THEN + + (** Initialize **) + EQUIV_INITIATE_TAC eqin THEN + RULE_ASSUM_TAC (REWRITE_RULE[BIGNUM_FROM_MEMORY_BYTES]) THEN + + (* Do symbolic simulations on the two programs using EQUIV_STEPS_TAC. + As explained before, the action is an OCaml list. + Each item describes: + - ("equal",begin line number of program 1 (start from 0), + end line number of program 1 (not inclusive), + begin line number of program 2, + end line number of program 2) + : means that these instructions in program 1 and program 2 must + yield sysmbolically equivalent output. Therefore, EQUIV_STEPS_TAC + uses a lock-step simulation for these. + If the symbolic outputs of the matching instructions are not having + equal expression, it will print an error message. + Actually, it tries to solve a simple bit-vector equality such as + 'x * (y + 1) = x * y + x', + and can succeed. This is exactly the example case here. + - ("replace",beign line number of program 1, + end line number of program 1 (not inclusive), + begin line number of program 2, + end line number of program 2) + : means that these instructions in program 1 and 2 differ. + EQUIV_STEPS_TAC uses stuttering simulations for each program. + *) + EQUIV_STEPS_TAC [ + ("equal",0,1,0,1); + ("replace",1,3,1,3); + ("equal",3,4,3,4) + ] EXEC EXEC2 THEN + + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + (* This tactic below is typically fixed and probably you will want to reuse. :) *) + CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[eqout; + BIGNUM_EXPAND_CONV `bignum_from_memory (outbuf,1) s`] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange pair **) + MONOTONE_MAYCHANGE_CONJ_TAC + ]);; + +(* + If the EQUIV_STEPS_TAC fails to prove that instructions that are supposed + to be equivalent according to actions are yielding equal output expressions, + it will print a message like this: + + ARM_LOCKSTEP_TAC (4,4) + Running left... + Running right... + 1 basis elements and 0 critical pairs + - Error: WORD_RULE could not prove + ` = ` + + If you are certain that these expressions must be equal, you can improve + `extra_word_CONV` of symbolic simulator by adding a custom word equation + to extra_word_CONV. 
+ + ``` + let org_convs = !extra_word_CONV;; + extra_word_CONV := (GEN_REWRITE_CONV I [])::org_convs;; + ``` +*) diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac2.S new file mode 100644 index 00000000000..39f772cf54b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_equivtac2.S @@ -0,0 +1,4 @@ +ldp x21, x20, [x0] +mul x22, x21, x20 +add x22, x22, x21 // x21 * x20 + x21 +str x22, [x1] diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.S new file mode 100644 index 00000000000..46aac446e30 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.S @@ -0,0 +1,6 @@ +loop: +add x2, x2, #2 +add x0, x0, #1 +cmp x0, x1 +bne loop + diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.ml new file mode 100644 index 00000000000..1c99d9b3afc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop.ml @@ -0,0 +1,151 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that proves equivalence of two loops. +******************************************************************************) + +needs "arm/proofs/equiv.ml";; + +(* Prove that these two loops are equivalent in the sense that the results of + two X2 are same. *) + +let loop_mc = define_assert_from_elf "loop_mc" "arm/tutorial/rel_loop.o" [ + 0x91000842; (* arm_ADD X2 X2 (rvalue (word 2)) *) + 0x91000400; (* arm_ADD X0 X0 (rvalue (word 1)) *) + 0xeb01001f; (* arm_CMP X0 X1 *) + 0x54ffffa1 (* arm_BNE (word 2097140) *) +];; + +let loop2_mc = define_assert_from_elf "loop2_mc" "arm/tutorial/rel_loop2.o" [ + 0x91000442; (* arm_ADD X2 X2 (rvalue (word 1)) *) + 0x91000442; (* arm_ADD X2 X2 (rvalue (word 1)) *) + 0x91000400; (* arm_ADD X0 X0 (rvalue (word 1)) *) + 0xeb01001f; (* arm_CMP X0 X1 *) + 0x54ffff81 (* arm_BNE (word 2097136) *) +];; + +let LOOP_EXEC = ARM_MK_EXEC_RULE loop_mc;; +let LOOP2_EXEC = ARM_MK_EXEC_RULE loop2_mc;; + +(* For relational reasoning, we use predicates and tactics that are slightly + different from those for unary reasoning. *) + +let LOOP_EQUIV = prove( + `forall pc1 pc2 n. + n > 0 /\ n < 2 EXP 64 ==> + // Relational hoare triple. + ensures2 arm + // Precondition + (\(s1,s2). aligned_bytes_loaded s1 (word pc1) loop_mc /\ + read PC s1 = word pc1 /\ + aligned_bytes_loaded s2 (word pc2) loop2_mc /\ + read PC s2 = word pc2 /\ + // X0 is the induction variable and X1 is n. + (read X0 s1 = word 0 /\ read X0 s2 = word 0 /\ + read X1 s1 = word n /\ read X1 s2 = word n /\ + // X2 must start equal. + (?k. read X2 s1 = k /\ read X2 s2 = k))) + // Postcondition + (\(s1,s2). aligned_bytes_loaded s1 (word pc1) loop_mc /\ + read PC s1 = word (pc1 + 12) /\ + aligned_bytes_loaded s2 (word pc2) loop2_mc /\ + read PC s2 = word (pc2 + 16) /\ + // They finish with an equal value. + (?k. read X2 s1 = k /\ read X2 s2 = k)) + // State components that may change. + (\(s1,s2) (s1',s2'). 
+ (MAYCHANGE [PC;X0;X2] ,, MAYCHANGE SOME_FLAGS ,, MAYCHANGE [events]) s1 s1' /\ + (MAYCHANGE [PC;X0;X2] ,, MAYCHANGE SOME_FLAGS ,, MAYCHANGE [events]) s2 s2') + // The number of small steps of the 'left' program and 'right' program. + (\s. 4 * n - 1) (\s. 5 * n - 1)`, + + REPEAT STRIP_TAC THEN REWRITE_TAC[SOME_FLAGS] THEN + (* Look at the definition of ENSURES2_WHILE_PAUP_TAC in arm/proofs/equiv.ml + to understand the meanings of arguments. *) + ENSURES2_WHILE_PAUP_TAC `0:num` `n:num` `pc1:num` `pc1+12` `pc2:num` `pc2+16` + `\(i:num) s1 s2. + read X0 s1 = word i /\ read X0 s2 = word i /\ + read X1 s1 = word n /\ read X1 s2 = word n /\ + (?k. read X2 s1 = k /\ read X2 s2 = k)` + `\(i:num) s. read ZF s <=> (word i:int64) = word n` + `\(i:num) s. read ZF s <=> (word i:int64) = word n` + `\(i:num). 3` + `\(i:num). 4` + `0` `0` `0` `0` `1` `1` THEN + REPEAT CONJ_TAC THENL [ + (* # loop itrs > 0 *) + ASM_ARITH_TAC; + + (* pre *) + MATCH_MP_TAC ENSURES2_TRIVIAL THEN + REWRITE_TAC[FORALL_PAIR_THM] THEN + REPEAT GEN_TAC THEN + MONOTONE_MAYCHANGE_CONJ_TAC; + + (* now the main loop! *) + REPEAT STRIP_TAC THEN + (* Start symbolic execution of two programs. *) + ENSURES2_INIT_TAC "s0" "s0'" THEN + + FIRST_X_ASSUM (MP_TAC o (check (is_exists o concl))) THEN + STRIP_TAC THEN + REWRITE_TAC[GSYM CONJ_ASSOC] THEN + + (* Symbolically execute the left program only. *) + ARM_N_STUTTER_LEFT_TAC LOOP_EXEC (1--3) None THEN + (* Symbolically execute the right program only. *) + ARM_N_STUTTER_RIGHT_TAC LOOP2_EXEC (1--4) "'" None THEN + (* Let's prove the postcondition. *) + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[WORD_ADD] THEN + + CONJ_TAC THENL [ + CONJ_TAC THENL [ + META_EXISTS_TAC THEN UNIFY_REFL_TAC; + REWRITE_TAC[VAL_EQ_0] THEN CONV_TAC WORD_RULE; + ]; + + MONOTONE_MAYCHANGE_CONJ_TAC + ]; + + (* backedge *) + REPEAT STRIP_TAC THEN + ENSURES2_INIT_TAC "s0" "s0'" THEN + UNDISCH_TAC `?k. 
read X2 s0 = k /\ read X2 s0' = k` THEN + STRIP_TAC THEN + REWRITE_TAC[GSYM CONJ_ASSOC] THEN + + ARM_N_STUTTER_LEFT_TAC LOOP_EXEC (1--1) None THEN + ARM_N_STUTTER_RIGHT_TAC LOOP2_EXEC (1--1) "'" None THEN + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[WORD_ADD] THEN + + SUBGOAL_THEN `(word i:int64 = word n) <=> F` SUBST_ALL_TAC THENL [ + REWRITE_TAC[WORD_EQ;CONG;DIMINDEX_64] THEN + IMP_REWRITE_TAC[MOD_LT] THEN ASM_ARITH_TAC; + + ALL_TAC + ] THEN + REWRITE_TAC[] THEN + CONJ_TAC THENL [ + + META_EXISTS_TAC THEN UNIFY_REFL_TAC; + + MONOTONE_MAYCHANGE_CONJ_TAC + ]; + + (* postcond *) + MATCH_MP_TAC ENSURES2_TRIVIAL THEN + REWRITE_TAC[FORALL_PAIR_THM] THEN + CONJ_TAC THENL [MESON_TAC[]; ALL_TAC] THEN + REPEAT GEN_TAC THEN MONOTONE_MAYCHANGE_CONJ_TAC; + + (* counter 1 *) + REWRITE_TAC[NSUM_CONST_NUMSEG] THEN ASM_ARITH_TAC; + + (* counter 2 *) + REWRITE_TAC[NSUM_CONST_NUMSEG] THEN ASM_ARITH_TAC; + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop2.S new file mode 100644 index 00000000000..60e78c5354d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_loop2.S @@ -0,0 +1,7 @@ +loop: +add x2, x2, #1 +add x2, x2, #1 +add x0, x0, #1 +cmp x0, x1 +bne loop + diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.S new file mode 100644 index 00000000000..ff10c42eef8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.S @@ -0,0 +1,6 @@ +ldr x10, [x0] +add x10, x10, #1 +str x10, [x1] +ldr x10, [x0, #8] +add x10, x10, #2 +str x10, [x1, #8] diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.ml new file mode 100644 index 00000000000..84fcba1fbdb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac.ml @@ -0,0 +1,187 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that proves equivalence of two straight-line codes + whose instructions are shuffled. +******************************************************************************) + +(* Please copy this file to the root directory of + s2n-bignum, then follow the instructions. *) + +needs "arm/proofs/equiv.ml";; + +(* This example will define & prove the equivalence of two programs + whose instructions are reordered, using ARM_N_STEPS_AND_ABBREV_TAC and + ARM_N_STEPS_AND_REWRITE_TAC. + These tactics receive the mapping between the lines of instructions + of the two programs (which is an OCaml integer list). + ARM_N_STEPS_AND_ABBREV_TAC symbolically simulates the "left" program, + introduces abbreviations of the output symbolic expressions of each + instruction, and stores it to an OCaml reference variable. + ARM_N_STEPS_AND_REWRITE_TAC symbolically simulates the "right" program, + finds the right abbreviation expression according to the line-number + mapping information, and rewrites the output expressions using the matched + abbreviation. 
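+   For example, the mapping used later in this file is
+
+     let inst_map = [1;4;2;5;3;6];;
+
+   i.e. instructions 1..6 of the second program (mc2) correspond to
+   instructions 1, 4, 2, 5, 3 and 6 of the first program (mc): the two loads
+   of mc2 map to lines 1 and 4 of mc, the two adds to lines 2 and 5, and the
+   two stores to lines 3 and 6.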
*) + +let mc = define_assert_from_elf "mc" "arm/tutorial/rel_reordertac.o" [ + 0xf940000a; (* arm_LDR X10 X0 (Immediate_Offset (word 0)) *) + 0x9100054a; (* arm_ADD X10 X10 (rvalue (word 1)) *) + 0xf900002a; (* arm_STR X10 X1 (Immediate_Offset (word 0)) *) + 0xf940040a; (* arm_LDR X10 X0 (Immediate_Offset (word 8)) *) + 0x9100094a; (* arm_ADD X10 X10 (rvalue (word 2)) *) + 0xf900042a (* arm_STR X10 X1 (Immediate_Offset (word 8)) *) +];; + +(* Note that the used registers are different between mc and mc2 + (X10 vs. X10,X11). This is fine since the tactics can smartly + map the registers. + Also, this reordering is correct only of [X0, X0+16) is disjoint with + [X1, X1+16). We will have this as an assumption in the equivalence + goal. *) +let mc2 = define_assert_from_elf "mc2" "arm/tutorial/rel_reordertac2.o" [ + 0xf940000a; (* arm_LDR X10 X0 (Immediate_Offset (word 0)) *) + 0xf940040b; (* arm_LDR X11 X0 (Immediate_Offset (word 8)) *) + 0x9100054a; (* arm_ADD X10 X10 (rvalue (word 1)) *) + 0x9100096b; (* arm_ADD X11 X11 (rvalue (word 2)) *) + 0xf900002a; (* arm_STR X10 X1 (Immediate_Offset (word 0)) *) + 0xf900042b (* arm_STR X11 X1 (Immediate_Offset (word 8)) *) +];; + +let EXEC = ARM_MK_EXEC_RULE mc;; +let EXEC2 = ARM_MK_EXEC_RULE mc2;; + +(* Define the equality between the input states. *) +let eqin = new_definition + `forall s1 s1' inbuf outbuf. + (eqin:(armstate#armstate)->int64->int64->bool) (s1,s1') inbuf outbuf <=> + (// The values of buffer pointers, X0 and X1. + // Their values are symbolically defined as inbuf and outbuf. + // outbuf is also used for the nonoverlapping precondition between + // the output buffer and the program bytecode. + read X0 s1 = inbuf /\ + read X0 s1' = inbuf /\ + read X1 s1 = outbuf /\ + read X1 s1' = outbuf /\ + // The equal buffer contents at the input buffer. '2' stands for 2 words + // (and 1 word is 8 bytes, hence 2*8=16 bytes) + (exists n. + bignum_from_memory (inbuf,2) s1 = n /\ + bignum_from_memory (inbuf,2) s1' = n))`;; + +(* Define the equality between the output states. *) +let eqout = new_definition + `forall s1 s1' inbuf outbuf. + (eqout:(armstate#armstate)->int64->int64->bool) (s1,s1') inbuf outbuf <=> + (read X0 s1 = inbuf /\ + read X0 s1' = inbuf /\ + read X1 s1 = outbuf /\ + read X1 s1' = outbuf /\ + (exists n. + bignum_from_memory (inbuf,2) s1 = n /\ + bignum_from_memory (inbuf,2) s1' = n) /\ + (exists n. + bignum_from_memory (outbuf,2) s1 = n /\ + bignum_from_memory (outbuf,2) s1' = n))`;; + +(* Now, build the program equivalence statement using + 'mk_equiv_statement_simple'. + Its first argument states the assumption that will appear at + LHS of ' ==> ensures2 ..(equiv statement)..'. + + If it fails, please try `arm_print_log := true`. *) +let equiv_goal = mk_equiv_statement_simple + `ALL (nonoverlapping (outbuf,16)) [ + (word pc:int64, LENGTH mc); + (word pc2:int64, LENGTH mc2); + (inbuf:int64, 16) + ]` + eqin (* Input state equivalence *) + eqout (* Output state equivalence *) + mc (* First program machine code *) + `MAYCHANGE [PC; X10] ,, MAYCHANGE [memory :> bytes (outbuf, 16)] ,, MAYCHANGE [events]` + mc2 (* Second program machine code *) + `MAYCHANGE [PC; X10; X11] ,, MAYCHANGE [memory :> bytes (outbuf, 16)] ,, MAYCHANGE [events]`;; + +(* equiv_goal is: + `forall pc pc2 inbuf outbuf. + ALL (nonoverlapping (outbuf,16)) + [word pc,LENGTH mc; word pc2,LENGTH mc2; inbuf,16] + ==> ensures2 arm + (\(s,s2). 
+ aligned_bytes_loaded s (word pc) mc /\ + read PC s = word pc /\ + aligned_bytes_loaded s2 (word pc2) mc2 /\ + read PC s2 = word pc2 /\ + eqin (s,s2) inbuf outbuf) + (\(s,s2). + aligned_bytes_loaded s (word pc) mc /\ + read PC s = word (pc + 24) /\ + aligned_bytes_loaded s2 (word pc2) mc2 /\ + read PC s2 = word (pc2 + 24) /\ + eqout (s,s2) inbuf outbuf) + (\(s,s2) (s',s2'). + (MAYCHANGE [PC; X10] ,, + MAYCHANGE [memory :> bytes (outbuf,16)] ,, + MAYCHANGE [events]) + s + s' /\ + (MAYCHANGE [PC; X10; X11] ,, + MAYCHANGE [memory :> bytes (outbuf,16)] ,, + MAYCHANGE [events]) + s2 + s2') + (\s. 6) + (\s. 6)` +*) + +(* Line numbers from the second program (mc2) to the first program (mc1). *) +let inst_map = [1;4;2;5;3;6];; + +(* (state number, (equation, fresh var)) *) +let state_to_abbrevs: (int * thm) list ref = ref [];; + +(* Now, let's prove the program equivalence. *) +let EQUIV = prove(equiv_goal, + + (* Rewrite ALL, nonoverlapping, and LENGTH * *) + REWRITE_TAC[ALL;NONOVERLAPPING_CLAUSES; fst EXEC; fst EXEC2] THEN + REPEAT STRIP_TAC THEN + + (** Initialize **) + EQUIV_INITIATE_TAC eqin THEN + RULE_ASSUM_TAC (REWRITE_RULE[BIGNUM_FROM_MEMORY_BYTES]) THEN + + (* Left *) + ARM_N_STEPS_AND_ABBREV_TAC EXEC (1--(List.length inst_map)) + state_to_abbrevs None THEN + + (* Right *) + ARM_N_STEPS_AND_REWRITE_TAC EXEC2 (1--(List.length inst_map)) + inst_map state_to_abbrevs None THEN + + (* Running the statements above step by step will raise an error + message saying that the tactic is not VALID. You can temporarily + disable the message by redefining 'e' as follows: + + let e tac = refine(by(tac));; + + The whole proof ("prove(...)") will still run okay. + *) + + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + (* This tactic below is typically fixed and probably you will want to reuse. :) *) + CONJ_TAC THENL [ + (** SUBGOAL 1. Outputs **) + ASM_REWRITE_TAC[eqout; + BIGNUM_EXPAND_CONV `bignum_from_memory (outbuf,2) s`] THEN + REPEAT CONJ_TAC THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + (** SUBGOAL 2. Maychange pair **) + MONOTONE_MAYCHANGE_CONJ_TAC + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac2.S new file mode 100644 index 00000000000..a58c3e01ee7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_reordertac2.S @@ -0,0 +1,6 @@ +ldr x10, [x0] +ldr x11, [x0, #8] +add x10, x10, #1 +add x11, x11, #2 +str x10, [x1] +str x11, [x1, #8] diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.S new file mode 100644 index 00000000000..1d1d8f2b40c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.S @@ -0,0 +1,3 @@ +add x0, x0, #1 +add x1, x1, #2 +add x0, x0, #3 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.ml new file mode 100644 index 00000000000..68083f0e302 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp.ml @@ -0,0 +1,95 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that proves equivalence of two straight-line codes. 
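+  (As can be read off the listings below, both programs leave X0 = a + 4 and
+   X1 = b + 2 for initial values a and b, which is why their final register
+   values can be proven equal.)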
+******************************************************************************) + +(* Please copy this file to the root directory of + s2n-bignum, then follow the instructions. *) + +needs "arm/proofs/equiv.ml";; + +(* Prove that given x0 and x1 are equal, their final results are also equal. *) + +let simp_mc = define_assert_from_elf "simp_mc" "arm/tutorial/rel_simp.o" [ + 0x91000400; (* arm_ADD X0 X0 (rvalue (word 1)) *) + 0x91000821; (* arm_ADD X1 X1 (rvalue (word 2)) *) + 0x91000c00 (* arm_ADD X0 X0 (rvalue (word 3)) *) +];; + +let simp2_mc = define_assert_from_elf "simp2_mc" "arm/tutorial/rel_simp2.o" [ + 0x91001000; (* arm_ADD X0 X0 (rvalue (word 4)) *) + 0x91000821 (* arm_ADD X1 X1 (rvalue (word 2)) *) +];; + +let SIMP_EXEC = ARM_MK_EXEC_RULE simp_mc;; +let SIMP2_EXEC = ARM_MK_EXEC_RULE simp2_mc;; + +(* For relational reasoning, we use predicates and tactics that are slightly + different from those for unary reasoning. *) + +let SIMP_EQUIV = prove( + `forall pc1 pc2 a b. + // Relational hoare triple. + ensures2 arm + // Precondition + (\(s1,s2). aligned_bytes_loaded s1 (word pc1) simp_mc /\ + read PC s1 = word pc1 /\ + aligned_bytes_loaded s2 (word pc2) simp2_mc /\ + read PC s2 = word pc2 /\ + // X0 and X1 start equal. + read X0 s1 = a /\ read X0 s2 = a /\ + read X1 s1 = b /\ read X1 s2 = b) + // Postcondition + (\(s1,s2). aligned_bytes_loaded s1 (word pc1) simp_mc /\ + read PC s1 = word (pc1 + 12) /\ + aligned_bytes_loaded s2 (word pc2) simp2_mc /\ + read PC s2 = word (pc2 + 8) /\ + // They finish with an equal value. + (?k. read X0 s1 = k /\ read X0 s2 = k) /\ + (?k2. read X1 s1 = k2 /\ read X1 s2 = k2)) + // State components that may change. + (\(s1,s2) (s1',s2'). + // PC,X0,X1 may change in the left program + MAYCHANGE [PC;X0;X1] s1 s1' /\ + // .. and in the right program as well. + MAYCHANGE [PC;X0;X1] s2 s2') + // The number of small steps of the 'left' program and 'right' program. + // 'ensures2' needs the number of small steps taken to reach at the + // postcondition. Similarly, 'ensures_n' is a unary predicate similar to + // 'ensures' but takes the number of steps too. 'ensures_n' will not + // appear in this example. + (\s. 3) (\s. 2)`, + + REPEAT STRIP_TAC THEN + (* Start symbolic execution of the two programs! The left program's initial + state is named as s0, and the right is s0'. *) + ENSURES2_INIT_TAC "s0" "s0'" THEN + + (* Symbolically execute the left program only. *) + ARM_N_STUTTER_LEFT_TAC SIMP_EXEC (1--3) None THEN + (* Symbolically execute the right program only. "'" is the suffix of the + state name. *) + ARM_N_STUTTER_RIGHT_TAC SIMP2_EXEC (1--2) "'" None THEN + + (* Let's prove the postcondition. *) + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + + CONJ_TAC THENL [ + (* ((?k. word_add a (word 4) = k) + Actually, simplification procedure in symbolic execution tactic already + folded 'word_add (word_add a (word 1)) (word 3)' into + 'word_add a (word 4)'. *) + (* META_EXISTS_TAC is somewhat similar to eexists in Coq. 
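+     Roughly, it replaces the existentially quantified k with a metavariable,
+     so the goal becomes `word_add a (word 4) = ?k`, and the UNIFY_REFL_TAC
+     that follows discharges it by instantiating the metavariable with the
+     left-hand side.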
*) + CONJ_TAC THENL [ + META_EXISTS_TAC THEN UNIFY_REFL_TAC; + META_EXISTS_TAC THEN UNIFY_REFL_TAC; + ]; + + (* Maychange pair *) + MONOTONE_MAYCHANGE_CONJ_TAC + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp2.S new file mode 100644 index 00000000000..f8d7753cc04 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_simp2.S @@ -0,0 +1,2 @@ +add x0, x0, #4 +add x1, x1, #2 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.S new file mode 100644 index 00000000000..e60e6235da0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.S @@ -0,0 +1,28 @@ +#define a0 x10 +#define a1 x11 +#define res0 x20 +#define res1 x21 +#define res2 x22 +#define res3 x23 +#define res1t x27 +#define res2t x28 +#define res3t x29 +#define a0a0_hi x12 +#define a0a1_lo x13 +#define a0a1_hi x14 +#define a1a1_lo x15 +#define a1a1_hi x16 + +ldp a0, a1, [x1] +mul res0, a0, a0 +umulh a0a0_hi, a0, a0 +mul a0a1_lo, a0, a1 +umulh a0a1_hi, a0, a1 +mul a1a1_lo, a1, a1 +umulh a1a1_hi, a1, a1 +adds res1t, a0a0_hi, a0a1_lo +adcs res2t, a1a1_lo, a0a1_hi +adc res3t, a1a1_hi, xzr +adds res1, res1t, a0a1_lo +adcs res2, res2t, a0a1_hi +adc res3, res3t, xzr diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.ml new file mode 100644 index 00000000000..287eada88a9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq.ml @@ -0,0 +1,148 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + An example that proves the equivalence of + two implementations of 128x128->256-bit squaring. +******************************************************************************) + +needs "arm/proofs/equiv.ml";; +(* neon_helper.ml has lemmas and tactics that are useful to prove programs + manipulating SIMD registers. *) +needs "arm/proofs/neon_helper.ml";; + +(* This is a realistic (and a bit 'dirty') example that shows how equivalence + of vectorization is proven using relational reasoning. + It is always welcome to clean this proof further. 
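+   The 'left' program (rel_veceq.S) computes the 256-bit square with scalar
+   MUL/UMULH instructions only, whereas the 'right' program (rel_veceq2.S)
+   vectorizes the two 64x64-bit squarings with NEON (UMULL/UMULL2/XTN/UZP2)
+   and folds in the cross product with scalar MUL/UMULH. The statement below
+   says that both end with the same four result digits, just held in
+   different registers (see equiv_output_states).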
*) + +let veceq_mc = define_assert_from_elf "veceq_mc" "arm/tutorial/rel_veceq.o" [ + 0xa9402c2a; (* arm_LDP X10 X11 X1 (Immediate_Offset (iword (&0))) *) + 0x9b0a7d54; (* arm_MUL X20 X10 X10 *) + 0x9bca7d4c; (* arm_UMULH X12 X10 X10 *) + 0x9b0b7d4d; (* arm_MUL X13 X10 X11 *) + 0x9bcb7d4e; (* arm_UMULH X14 X10 X11 *) + 0x9b0b7d6f; (* arm_MUL X15 X11 X11 *) + 0x9bcb7d70; (* arm_UMULH X16 X11 X11 *) + 0xab0d019b; (* arm_ADDS X27 X12 X13 *) + 0xba0e01fc; (* arm_ADCS X28 X15 X14 *) + 0x9a1f021d; (* arm_ADC X29 X16 XZR *) + 0xab0d0375; (* arm_ADDS X21 X27 X13 *) + 0xba0e0396; (* arm_ADCS X22 X28 X14 *) + 0x9a1f03b7 (* arm_ADC X23 X29 XZR *) +];; + +let VECEQ_EXEC = ARM_MK_EXEC_RULE veceq_mc;; + +let veceq2_mc = define_assert_from_elf "veceq2_mc" "arm/tutorial/rel_veceq2.o" [ + 0xa9403c29; (* arm_LDP X9 X15 X1 (Immediate_Offset (iword (&0))) *) + 0x3dc0003e; (* arm_LDR Q30 X1 (Immediate_Offset (word 0)) *) + 0x2ebec3c0; (* arm_UMULL_VEC Q0 Q30 Q30 32 *) + 0x6ebec3c2; (* arm_UMULL2_VEC Q2 Q30 Q30 32 *) + 0x0ea12bd8; (* arm_XTN Q24 Q30 32 *) + 0x4e9e5bde; (* arm_UZP2 Q30 Q30 Q30 32 *) + 0x2eb8c3de; (* arm_UMULL_VEC Q30 Q30 Q24 32 *) + 0x4e083c07; (* arm_UMOV X7 Q0 0 8 *) + 0x4e183c0e; (* arm_UMOV X14 Q0 1 8 *) + 0x4e083c53; (* arm_UMOV X19 Q2 0 8 *) + 0x4e183c56; (* arm_UMOV X22 Q2 1 8 *) + 0x4e083fc4; (* arm_UMOV X4 Q30 0 8 *) + 0x4e183fcc; (* arm_UMOV X12 Q30 1 8 *) + 0xab0484f5; (* arm_ADDS X21 X7 (Shiftedreg X4 LSL 33) *) + 0xd35ffc84; (* arm_LSR X4 X4 31 *) + 0x9a0401ce; (* arm_ADC X14 X14 X4 *) + 0xab0c8673; (* arm_ADDS X19 X19 (Shiftedreg X12 LSL 33) *) + 0xd35ffd84; (* arm_LSR X4 X12 31 *) + 0x9a0402d6; (* arm_ADC X22 X22 X4 *) + 0x9b0f7d24; (* arm_MUL X4 X9 X15 *) + 0x9bcf7d2c; (* arm_UMULH X12 X9 X15 *) + 0xab0405d8; (* arm_ADDS X24 X14 (Shiftedreg X4 LSL 1) *) + 0x93c4fd84; (* arm_EXTR X4 X12 X4 63 *) + 0xba040273; (* arm_ADCS X19 X19 X4 *) + 0xd37ffd84; (* arm_LSR X4 X12 63 *) + 0x9a0402c4 (* arm_ADC X4 X22 X4 *) +];; + +let VECEQ2_EXEC = ARM_MK_EXEC_RULE veceq2_mc;; + + +(* Define the equivalence of input states and output states. *) + +let equiv_input_states = new_definition + `forall s1 s1' x. + (equiv_input_states:(armstate#armstate)->int64->bool) (s1,s1') x <=> + (read X1 s1 = x /\ read X1 s1' = x /\ + exists a. bignum_from_memory (x,2) s1 = a /\ + bignum_from_memory (x,2) s1' = a)`;; + +let equiv_output_states = new_definition + `forall s1 s1'. + (equiv_output_states:(armstate#armstate)->bool) (s1,s1') <=> + (exists a. read X20 s1 = a /\ read X21 s1' = a /\ + (exists b. read X21 s1 = b /\ read X24 s1' = b /\ + (exists c. read X22 s1 = c /\ read X19 s1' = c /\ + (exists d. read X23 s1 = d /\ read X4 s1' = d))))`;; + + +(* Define the equivalence statement which is ensures2 predicate. + Please look at the definition of mk_equiv_statement_simple for full + definitions of its parameters. *) + +let equiv_goal1 = mk_equiv_statement_simple + `T` (* assumption such as nonoverlapping; nothing here, so simply T. *) + equiv_input_states + equiv_output_states + veceq_mc + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X20;X21;X22;X23;X27;X28;X29]` + veceq2_mc + `MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [X19;X21;X22;X24]`;; + +(* 'actions' is a list of line diffs between the two assembly files (textual + form). This isn't important in this example, but to understand when + it is useful you might want to look at proofs in s2n-bignum that use + EQUIV_STEPS_TAC. 
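+   Roughly, ("replace",0,13,0,26) marks instructions [0,13) of the left
+   program and [0,26) of the right program - i.e. both programs in full -
+   as a differing region that is simulated independently on each side,
+   whereas an "equal" action steps matching instructions in lock-step.
+   For instance, a hypothetical diff of two mostly-identical programs
+   might look like
+     [("equal",0,5,0,5); ("replace",5,9,5,12); ("equal",9,20,12,23)]
+   where the first five and the last eleven instructions correspond
+   one-to-one.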
*) +let actions = [ + ("replace",0,13,0,26) +];; + +(* After every small step, simplify the symbolic expression using + a new custom rewrite rule that is WORD_BITMANIP_SIMP_LEMMAS. *) +extra_word_CONV := + [GEN_REWRITE_CONV I [WORD_BITMANIP_SIMP_LEMMAS]] + @ (!extra_word_CONV);; + + +let VECTORIZE_SQR_EQUIV = prove(equiv_goal1, + + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; + fst VECEQ_EXEC; fst VECEQ2_EXEC] THEN + REPEAT STRIP_TAC THEN + (** Initialize **) + EQUIV_INITIATE_TAC equiv_input_states THEN + RULE_ASSUM_TAC (REWRITE_RULE[BIGNUM_FROM_MEMORY_BYTES]) THEN + + (* Symbolically simulate each program, to the last instructions. *) + EQUIV_STEPS_TAC actions VECEQ_EXEC VECEQ2_EXEC THEN + + (* For some reason, using this additional RULE_ASSUME_TAC was necessary... + Adding these rules to extra_word_CONV didn't work. Yes, this is a 'dirty' + part of the current status (= manual rewrites are sometimes necessary). + Also, these rewrite rules (WORD_SQR128_DIGIT0, ...) are not succinct. + Would be great if their proofs are shorter at least. *) + RULE_ASSUM_TAC (REWRITE_RULE[WORD_SQR128_DIGIT0; + WORD_SQR128_DIGIT1;WORD_SQR128_DIGIT2; + WORD_SQR128_DIGIT3]) THEN + + REPEAT_N 2 ENSURES_N_FINAL_STATE_TAC THEN + (* Prove remaining clauses from the postcondition *) + ASM_REWRITE_TAC[] THEN + CONJ_TAC THENL [ + (* Prove the equivalence! *) + ASM_REWRITE_TAC[equiv_output_states;mk_equiv_regs;mk_equiv_bool_regs] THEN + REPEAT (HINT_EXISTS_REFL_TAC THEN ASM_REWRITE_TAC[]); + + MONOTONE_MAYCHANGE_CONJ_TAC + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq2.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq2.S new file mode 100644 index 00000000000..521374de054 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rel_veceq2.S @@ -0,0 +1,26 @@ +ldp x9, x15, [x1] +ldr q30, [x1] +umull v0.2D, v30.2S, v30.2S +umull2 v2.2D, v30.4S, v30.4S +xtn v24.2S, v30.2D +uzp2 v30.4S, v30.4S, v30.4S +umull v30.2D, v30.2S, v24.2S +mov x7, v0.d[0] +mov x14, v0.d[1] +mov x19, v2.d[0] +mov x22, v2.d[1] +mov x4, v30.d[0] +mov x12, v30.d[1] +adds x21, x7, x4, lsl #33 +lsr x4, x4, #31 +adc x14, x14, x4 +adds x19, x19, x12, lsl #33 +lsr x4, x12, #31 +adc x22, x22, x4 +mul x4, x9, x15 +umulh x12, x9, x15 +adds x24, x14, x4, lsl #1 +extr x4, x12, x4, #63 +adcs x19, x19, x4 +lsr x4, x12, #63 +adc x4, x22, x4 //x21,x24,x19,x4 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.S new file mode 100644 index 00000000000..e0554712153 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.S @@ -0,0 +1,109 @@ +/* + This assembly file is a cleaned (and less ABI-compliant) version of GCC + output of the following + C program: + + const int x[10] = {2, 4, 6, 8, 10, 12, 14, 16, 18, 20}; + const int y[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + const int z = 1; + + int f(uint64_t i) { + return x[i] + y[i]; + } + + int g(int64_t i) { + return f(i + z); + } +*/ + +#if defined(__linux__) && defined(__ELF__) +.section .rodata + .global x + .type x, %object + .size x, 40 +#elif defined(__APPLE__) +.const_data +#endif + .align 3 +x: + .word 2 + .word 4 + .word 6 + .word 8 + .word 10 + .word 12 + .word 14 + .word 16 + .word 18 + .word 20 + +#if defined(__linux__) && defined(__ELF__) + .global y + .type y, %object + .size y, 40 +#endif + .align 3 +y: + .word 1 + .word 2 + .word 3 + .word 4 + .word 5 + .word 6 + .word 7 + .word 8 + .word 9 + .word 10 + 
+#if defined(__linux__) && defined(__ELF__) + .global z + .type z, %object + .size z, 4 +#endif + .align 3 +z: + .word 1 + +.text + .align 2 +#if defined(__linux__) && defined(__ELF__) + .type f, %function +#endif + +f: + mov x3, x0 +#if defined(__linux__) && defined(__ELF__) + adrp x10, x + add x10, x10, :lo12:x +#else + adrp x10, x@PAGE + add x10, x10, x@PAGEOFF +#endif + mov x1, x3 + ldr w1, [x10, x1, lsl 2] +#if defined(__linux__) && defined(__ELF__) + adrp x11, y + add x11, x11, :lo12:y +#else + adrp x11, y@PAGE + add x11, x11, y@PAGEOFF +#endif + mov x2, x3 + ldr w0, [x11, x2, lsl 2] + add w0, w1, w0 + ret + +#if defined(__linux__) && defined(__ELF__) + .type g, %function +#endif +g: +#if defined(__linux__) && defined(__ELF__) + adrp x10, z + add x10, x10, :lo12:z +#else + adrp x10, z@PAGE + add x10, x10, z@PAGEOFF +#endif + ldr w1, [x10] + add x0, x1, x0 + b f diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.ml new file mode 100644 index 00000000000..65d92cf6da5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/rodata.ml @@ -0,0 +1,232 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Verifying a program that reads constant data from .rodata +******************************************************************************) + +needs "arm/proofs/base.ml";; + +(* The following command will print the assertion checker fn of + "arm/tutorial/rodata.o": + + print_literal_relocs_from_elf "arm/tutorial/rodata.o";; + + Or, you can also use + + save_literal_relocs_from_elf "out.txt" "arm/tutorial/rodata.o";; +*) + +let a_mc,a_constants_data = define_assert_relocs_from_elf "a_mc" + "arm/tutorial/rodata.o" +(fun w BL ADR ADRP ADD_rri64 -> [ +(* int f(int) *) + w 0xaa0003e3; (* arm_MOV X3 X0 *) + + (* NOTE: The two entries below have the names of symbols. If they appear as + an empty string on your custom object file, please check whether the + symbols are defined as global in assembly. Local symbols will not have + their names recorded in string table. *) + ADRP (mk_var("x",`:num`),0,4,10); + ADD_rri64 (mk_var("x",`:num`),0,10,10); + + w 0xaa0303e1; (* arm_MOV X1 X3 *) + w 0xb8617941; (* arm_LDR W1 X10 (Shiftreg_Offset X1 2) *) + + ADRP (mk_var("y",`:num`),0,20,11); + ADD_rri64 (mk_var("y",`:num`),0,11,11); + + w 0xaa0303e2; (* arm_MOV X2 X3 *) + w 0xb8627960; (* arm_LDR W0 X11 (Shiftreg_Offset X2 2) *) + w 0x0b000020; (* arm_ADD W0 W1 W0 *) + w 0xd65f03c0; (* arm_RET X30 *) + +(* int g(int) *) + ADRP (mk_var("z",`:num`),0,44,10); + ADD_rri64 (mk_var("z",`:num`),0,10,10); + w 0xb9400141; (* arm_LDR W1 X10 (Immediate_Offset (word 0)) *) + w 0x8b000020; (* arm_ADD X0 X1 X0 *) + w 0x17fffff1 (* arm_B (word 268435396) *) +]);; + +(* Compared to the result of define_asserts_from_elf, the return value of + define_assert_relocs_from_elf has the following differences: + + 1. It returns a_constants_data, which is a list of thm. + Each thm describes a definition of an object in a read-only section: + + # a_constants_data;; + + - : thm list = + [|- z_data = [word 30; word 0; word 0; word 0]; + |- y_data = [word 1; word 0; word 0; word 0; ...]; + |- x_data = [word 2; word 0; word 0; word 0; ...]] + + 2. 
The returned a_mc is a function that takes the addresses of pc, x, y and + (x and y are the addresses of the two constant arrays) and returns + the corresponding machine code. + + # a_mc;; + + - : thm = + |- forall x pc y. a_mc pc x y = CONS (word 227) (...) +*) + +let EXEC = ARM_MK_EXEC_RULE a_mc;; + +(* Two helper tactics. + 1. INTRO_READ_MEMORY_FROM_BYTES8_TAC t: + If t is `read (memory :> bytesN ...) sM`, prove a theorem + `read (memory :> bytesN ...) sM = ` and introduce it + as an assumption, from the existing `read (memory :> bytes8 ..) sM = ..` + assumptions. + + 2. EXPLODE_BYTELIST_ASSUM_TAC: + Find assumption `read (memory :> bytelist (...)) s = ..` and explode + it to a list of `read (memory :> bytes8 (...)) s = ..` and reintroduce + them as assumptions. +*) +let INTRO_READ_MEMORY_FROM_BYTES8_TAC (t:term) = + (* Convert t into word_joins of 1-byte reads. *) + let r = REWRITE_CONV [READ_MEMORY_BYTESIZED_SPLIT] t in + (* Offset canonicalization, and then rewrite using assumptions *) + let r = REWRITE_RULE[WORD_ADD_ASSOC_CONSTS;WORD_ADD_0;ARITH] r in + MP_TAC r THEN + ASM (GEN_REWRITE_TAC (LAND_CONV o ONCE_DEPTH_CONV)) [] THEN + CONV_TAC (LAND_CONV WORD_REDUCE_CONV) THEN + DISCH_TAC;; + +let EXPLODE_BYTELIST_ASSUM_TAC = + FIRST_X_ASSUM (fun th -> + let _ = find_term (fun t -> name_of t = "bytelist") (concl th) in + (* Unfold the constant arrays! *) + let unfolded_bytes_loaded = REWRITE_RULE a_constants_data th in + (* Fold LENGTH array, and explode arr using BYTELIST_EXPAND_CONV *) + MP_TAC (CONV_RULE (ONCE_DEPTH_CONV LENGTH_CONV THENC + LAND_CONV BYTELIST_EXPAND_CONV) + unfolded_bytes_loaded)) THEN + (* [a;b;..] = [x;y;..] is a = x /\ b = y /\ ... *) + REWRITE_TAC [CONS_11] THEN + STRIP_TAC;; + + +let F_SPEC = prove(`forall x y z i pc retpc. + // These two assumptions state that the distance between symbol x and pc+4 + // (which is the first adrp) do not overflow, and so does symbol y and + // pc+20. + adrp_within_bounds (word x) (word (pc + 4)) /\ + adrp_within_bounds (word y) (word (pc + 20)) /\ + val i < 10 + ==> + ensures arm + (\s. aligned_bytes_loaded s (word pc) (a_mc pc x y z) /\ + read (memory :> bytelist (word x, LENGTH x_data)) s = x_data /\ + read (memory :> bytelist (word y, LENGTH y_data)) s = y_data /\ + read PC s = word pc /\ + read X0 s = i /\ + read X30 s = retpc) + (\s. read W0 s = word (3 * (1 + val i)) /\ + read PC s = retpc) + (MAYCHANGE [X0; X1; X2; X3; X10; X11; PC] ,, MAYCHANGE [events])`, + + REPEAT STRIP_TAC THEN + ENSURES_INIT_TAC "s0" THEN + + (* Let's prove the constant array is storing some structured int sequence. *) + SUBGOAL_THEN + `read (memory :> bytes32 (word_add (word x) (word (4 * val (i:int64))))) s0 = word (2 * (val i+1)) /\ + read (memory :> bytes32 (word_add (word y) (word (4 * val i)))) s0 = word (val i+1)` + MP_TAC THENL [ + + (* Explode the 40-byte constant memory reads into 40 1-bytes! + Do it twice, one for x and one for y. *) + REPEAT_N 2 EXPLODE_BYTELIST_ASSUM_TAC THEN + + (* For each case where i < 10, concretely evaluate the values from the + exploded bytes, proving the equality. 
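+       CONV_TAC EXPAND_CASES_CONV below turns the quantified goal
+       `!i'. i' < 10 ==> ...` into its ten concrete instances (i' = 0..9),
+       each of which is then discharged by reading the corresponding bytes.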
*) + ABBREV_TAC `i' = val (i:int64)` THEN + UNDISCH_TAC `i' < 10` THEN + SPEC_TAC (`i':num`,`i':num`) THEN + CONV_TAC EXPAND_CASES_CONV THEN + REWRITE_TAC[ARITH;WORD_ADD_0] THEN + + REPEAT CONJ_TAC THEN (fun (asl,w) -> + INTRO_READ_MEMORY_FROM_BYTES8_TAC (lhs w) (asl,w) + ) THEN ASM_REWRITE_TAC[]; + + ALL_TAC + ] THEN + + STRIP_TAC THEN + + ARM_STEPS_TAC EXEC (1--3) THEN + FIRST_X_ASSUM (fun th -> MP_TAC th THEN IMP_REWRITE_TAC[ADRP_ADD_FOLD] THEN DISCH_TAC) THEN + + ARM_STEPS_TAC EXEC (4--7) THEN + FIRST_X_ASSUM (fun th -> MP_TAC th THEN IMP_REWRITE_TAC[ADRP_ADD_FOLD] THEN DISCH_TAC) THEN + + ARM_STEPS_TAC EXEC (8--11) THEN + + (* Prove the postcondition. *) + ENSURES_FINAL_STATE_TAC THEN + + ASM_REWRITE_TAC[WREG_EXPAND_CLAUSES;READ_ZEROTOP_32] THEN + REWRITE_TAC[WORD_BLAST`word_zx (word_zx (x:(32)word):(64)word):(32)word = x`] THEN + CONV_TAC WORD_RULE);; + + +(* Proving the specification of function g(i) that calls f(i + z). *) + +let G_SPEC = prove(`forall x y z i pc retpc. + adrp_within_bounds (word x) (word (pc + 4)) /\ + adrp_within_bounds (word y) (word (pc + 20)) /\ + adrp_within_bounds (word z) (word (pc + 44)) /\ + val i < 9 + ==> + ensures arm + (\s. aligned_bytes_loaded s (word pc) (a_mc pc x y z) /\ + read (memory :> bytelist (word x, LENGTH x_data)) s = x_data /\ + read (memory :> bytelist (word y, LENGTH y_data)) s = y_data /\ + read (memory :> bytelist (word z, LENGTH z_data)) s = z_data /\ + read PC s = word (pc + 0x2c) /\ + read X0 s = i /\ + read X30 s = retpc) + (\s. read W0 s = word (3 * (2 + val i)) /\ + read PC s = retpc) + (MAYCHANGE [X0; X1; X2; X3; X10; X11; PC] ,, MAYCHANGE [events])`, + + REPEAT STRIP_TAC THEN + + ENSURES_INIT_TAC "s0" THEN + + ARM_STEPS_TAC EXEC (1--2) THEN + FIRST_X_ASSUM (fun th -> MP_TAC th THEN IMP_REWRITE_TAC[ADRP_ADD_FOLD] THEN DISCH_TAC) THEN + + (* Prepare load z. *) + EXPLODE_BYTELIST_ASSUM_TAC THEN + INTRO_READ_MEMORY_FROM_BYTES8_TAC + `read (memory :> bytes32 (word z)) s2` THEN + (* Expand read W0 to read X0. *) + RULE_ASSUM_TAC(REWRITE_RULE[WREG_EXPAND_CLAUSES;READ_ZEROTOP_32]) THEN + ARM_STEPS_TAC EXEC (3--4) THEN + + SUBGOAL_THEN `val (word_add (word 1) i:int64) < 10` ASSUME_TAC THENL [ + REWRITE_TAC[VAL_WORD_ADD;VAL_WORD;DIMINDEX_64] THEN ASM_ARITH_TAC; + ALL_TAC + ] THEN + ARM_STEPS_TAC EXEC [5] THEN + + (* Call ARM_SUBROUTINE_SIM_TAC with its arguments. *) + ARM_SUBROUTINE_SIM_TAC + (SPEC_ALL a_mc,EXEC,0,SPEC_ALL a_mc,F_SPEC) + [`x:num`;`y:num`;`z:num`;`read X0 s`; + `pc:num`; `read X30 s`] 6 THEN + + (* Prove the postcondition. *) + ENSURES_FINAL_STATE_TAC THEN + + ASM_REWRITE_TAC[VAL_WORD_ADD;DIMINDEX_64] THEN + AP_TERM_TAC THEN CONV_TAC WORD_REDUCE_CONV THEN + IMP_REWRITE_TAC[MOD_LT] THEN ASM_ARITH_TAC);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.S new file mode 100644 index 00000000000..c3a16766210 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.S @@ -0,0 +1,4 @@ +add x1, x1, x0 +add x2, x2, x0 +mov x3, #2 +mul x1, x1, x3 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.ml new file mode 100644 index 00000000000..51dadf98bb1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/sequence.ml @@ -0,0 +1,101 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Prove a property of a simple program by splitting into two sequential + chunks with an intermediate assertion. +******************************************************************************) + +(* Please copy this file to the root directory of s2n-bignum, then + follow the instructions. *) + +needs "arm/proofs/base.ml";; + +(* Given a program + 0: 8b000021 add x1, x1, x0 + 4: 8b000042 add x2, x2, x0 + 8: d2800043 mov x3, #0x2 + c: 9b037c21 mul x1, x1, x3 + + Let's prove that x1 in the final state is (x1 + x0) * 2. + As done in "simple.ml", this can be done using symbolic execution. However, + in this file, we will try a slightly different approach: + (1) The program will be splitted into two smaller programs: + + First prog: + 0: 8b000021 add x1, x1, x0 + 4: 8b000042 add x2, x2, x0 + + Second prog: + 8: d2800043 mov x3, #0x2 + c: 9b037c21 mul x1, x1, x3 + + (2) Each program will have its 'ensures' predicate specifying the pre and + postcondition. The postcondition of the first program will be equivalent to + the second one. + (3) By proving the two 'ensures' predicate, the specification of whole + program can be proven. +*) + +let sequence_mc = new_definition `sequence_mc = [ + word 0x21; word 0x00; word 0x00; word 0x8b; // add x2, x1, x0 + word 0x42; word 0x00; word 0x00; word 0x8b; // add x2, x2, x0 + word 0x43; word 0x00; word 0x80; word 0xd2; // mov x3, #0x2 + word 0x21; word 0x7c; word 0x03; word 0x9b // mul x1, x1, x3 + ]:((8)word)list`;; + +let EXEC = ARM_MK_EXEC_RULE sequence_mc;; + +let sequence_SPEC = prove( + `forall pc a b. + ensures arm + // Precondition + (\s. aligned_bytes_loaded s (word pc) sequence_mc /\ + read PC s = word pc /\ + read X0 s = word a /\ + read X1 s = word b /\ + read X2 s = word c) + // Postcondition + (\s. read PC s = word (pc+16) /\ + read X1 s = word ((a + b) * 2)) + // Registers (and memory locations) that may change after execution + (MAYCHANGE [PC;X1;X2;X3])`, + (* Strips the outermost universal quantifier from the conclusion of a goal *) + REPEAT STRIP_TAC THEN + + (* Use ENSURES_SEQUENCE_TAC to split the program into two chunks: + [pc, pc+8) and [pc+8, pc+16). The second argument of the tactic + `\s. read X1 s = word (a + b)` is a lambda function stating the + intermediate state at pc+8. + The result of this tactic will be a conjunction of two ensures, + the left clause of which is a spec of the first chunk and the + right clause is the right one. *) + ENSURES_SEQUENCE_TAC + `pc + 8` + `\s. read X1 s = word (a + b)` THEN + + (* Split the conjunction and create two subgoals. *) + CONJ_TAC THENL [ + (* The first subgoal. *) + (* Now we can use the symbolic execution tactics introduced in "simple.ml". *) + (* Start symbolic execution with state 's0' *) + ENSURES_INIT_TAC "s0" THEN + (* Symbolically run two instructions *) + ARM_STEPS_TAC EXEC (1--2) THEN + (* Try to prove the postcondition and frame as much as possible *) + ENSURES_FINAL_STATE_TAC THEN + (* Use ASM_REWRITE_TAC[] to rewrite the goal using equalities in assumptions. 
*) + ASM_REWRITE_TAC[] THEN + (* Prove: `word_add (word b) (word a) = word (a + b)` *) + CONV_TAC WORD_RULE; + + (* The second subgoal *) + ENSURES_INIT_TAC "s0" THEN + ARM_STEPS_TAC EXEC (1--2) THEN + ENSURES_FINAL_STATE_TAC THEN + ASM_REWRITE_TAC[] THEN + (* Prove: `word (0 + val (word (a + b)) * 2) = word ((a + b) * 2)` *) + CONV_TAC WORD_RULE; + ]);; diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.S b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.S new file mode 100644 index 00000000000..9439996e1af --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.S @@ -0,0 +1,2 @@ +add x2, x1, x0 +sub x2, x2, x1 diff --git a/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.ml b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.ml new file mode 100644 index 00000000000..df22765e54b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/arm/tutorial/simple.ml @@ -0,0 +1,107 @@ +(* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(****************************************************************************** + Proving a simple property about program 'simple.S' +******************************************************************************) + +(* Please copy this file to the root directory of s2n-bignum, then + follow the instructions. *) + +needs "arm/proofs/base.ml";; + +(* Let's prove a simple property of the following program: + + 0: 8b000022 add x2, x1, x0 + 4: cb010042 sub x2, x2, x1 + + Let's start with defining a byte sequence of a program 'simple.S' +*) +let simple_mc = new_definition `simple_mc = [ + word 0x22; word 0x00; word 0x00; word 0x8b; // add x2, x1, x0 + word 0x42; word 0x00; word 0x01; word 0xcb // sub x2, x2, x1 + ]:((8)word)list`;; + +(* Or, you can read .o file and store the byte list as follows: +let simple_mc = define_assert_from_elf "simple_mc" "arm/tutorial/simple.o" +[ + 0x8b000022; (* arm_ADD X2 X1 X0 *) + 0xcb010042 (* arm_SUB X2 X2 X1 *) +];; + +You can get the above OCaml list data structure from +`print_literal_from_elf "<.o file>"` or `save_literal_from_elf "" +"<.o file>"`. +*) + +(* ARM_MK_EXEC_RULE decodes the byte sequence into conjunction of + equalities between the bytes and instructions. *) +let EXEC = ARM_MK_EXEC_RULE simple_mc;; + +(* + In s2n-bignum, a specification (ensures) has three components: + 1. precondition: assume that a program starts from some program state satisfying the critera + 2. postcondition: the program must reach to a program state satisfying the criteria + 3. frame: the start program state and end program state must satisfy this relation + (e.g., this program only changes callee-save register) + In this file, + 1. precondition is: + - the 'simple' binary is loaded at some location in memory, say 'pc' + - the arm program counter register, PC, has value pc + - the arm register X0 has a symbolic value a and X1 has a symbolic value b + 2. postcondition is: + - the arm program counter register, PC, has value pc+8 + (meaning that two instructions have been executed) + - the arm register X2 has value b + 3. frame is: + - the register values of PC and X2 might have been changed + + If you are using the VSCode plugin of HOL Light, you can ctrl+click + (cmd+click for Mac) to jump to definitions. +*) +let SIMPLE_SPEC = prove( + `forall pc a b. + ensures arm + // Precondition + (\s. 
// aligned_bytes_loaded states that a byte sequence 'simple_mc' + // is loaded at memory location 'pc' in the state 's' and also + // 4-bytes aligned. + aligned_bytes_loaded s (word pc) simple_mc /\ + // 'word' is a bit-vector type in HOL Light. + // 'word a' means it is a bit-vector whose numeral (:num type) + // is 'a'. Its bit-width is inferred as 64 bits here, but it can + // be manually annotated as (word a:(64)word). + read PC s = word pc /\ + read X0 s = word a /\ + read X1 s = word b) + // Postcondition + (\s. read PC s = word (pc+8) /\ + read X2 s = word a) + // Registers (and memory locations) that may change after execution + (MAYCHANGE [PC;X2])`, + + (* Strips the outermost universal quantifier from the conclusion of a goal *) + REPEAT STRIP_TAC THEN + (* Start symbolic execution with state 's0' *) + ENSURES_INIT_TAC "s0" THEN + + (* Symbolically run two instructions *) + ARM_STEPS_TAC EXEC (1--2) THEN + (* Try to prove the postcondition and frame as much as possible *) + ENSURES_FINAL_STATE_TAC THEN + + (* Use ASM_REWRITE_TAC[] to rewrite the goal using equalities in assumptions. *) + ASM_REWRITE_TAC[] THEN + (* We need to prove this: + `word_sub (word_add (word b) (word a)) (word b) = word a` + Use an automated prover for words in HOL Light *) + CONV_TAC WORD_RULE);; + +(* Note that symbolic simulator will discard the output of instructions + if its inputs do not have their symbolic expressions defined in assumption. + To list which instructions are discarded by the simulation tactic. + set: + arm_print_log := true;; + This flag will also print helpful informations that are useful. *) diff --git a/third_party/s2n-bignum/s2n-bignum-imported/include/_internal_s2n_bignum.h b/third_party/s2n-bignum/s2n-bignum-imported/include/_internal_s2n_bignum.h new file mode 100644 index 00000000000..98181a5779b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/include/_internal_s2n_bignum.h @@ -0,0 +1,41 @@ + +#ifdef __APPLE__ +# define S2N_BN_SYMBOL(NAME) _##NAME +# if defined(__AARCH64EL__) || defined(__ARMEL__) +# define __LF %% +# else +# define __LF ; +# endif +#else +# define S2N_BN_SYMBOL(name) name +# define __LF ; +#endif + +#define S2N_BN_SYM_VISIBILITY_DIRECTIVE(name) .globl S2N_BN_SYMBOL(name) +#ifdef S2N_BN_HIDE_SYMBOLS +# ifdef __APPLE__ +# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) .private_extern S2N_BN_SYMBOL(name) +# else +# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) .hidden S2N_BN_SYMBOL(name) +# endif +#else +# define S2N_BN_SYM_PRIVACY_DIRECTIVE(name) /* NO-OP: S2N_BN_SYM_PRIVACY_DIRECTIVE */ +#endif + +// Enable indirect branch tracking support unless explicitly disabled +// with -DNO_IBT. If the platform supports CET, simply inherit this from +// the usual header. Otherwise manually define _CET_ENDBR, used at each +// x86 entry point, to be the ENDBR64 instruction, with an explicit byte +// sequence for compilers/assemblers that don't know about it. Note that +// it is safe to use ENDBR64 on all platforms, since the encoding is by +// design interpreted as a NOP on all pre-CET x86_64 processors. The only +// downside is a small increase in code size and potentially a modest +// slowdown from executing one more instruction. 
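+// For illustration: each x86_64 entry point in the assembly sources is
+// expected to place _CET_ENDBR immediately after its S2N_BN_SYMBOL(...)
+// label, so that it is a valid target for indirect branches under CET/IBT.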
+ +#if NO_IBT +#define _CET_ENDBR +#elif defined(__CET__) +#include +#else +#define _CET_ENDBR .byte 0xf3,0x0f,0x1e,0xfa +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum-c89.h b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum-c89.h new file mode 100644 index 00000000000..ca9bec37c72 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum-c89.h @@ -0,0 +1,1114 @@ +/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + */ + +/* ---------------------------------------------------------------------------- + * C prototypes for s2n-bignum functions, so you can use them in C programs via + * + * #include "s2n-bignum-c89.h" + * + * The functions are listed in alphabetical order with a brief description + * in comments for each one. For more detailed documentation see the comment + * banner at the top of the corresponding assembly (.S) file, and + * for the last word in what properties it satisfies see the spec in the + * formal proof (the .ml file in the architecture-specific directory). + * + * For some functions there are additional variants with names ending in + * "_alt". These have the same core mathematical functionality as their + * non-"alt" versions, but can be better suited to some microarchitectures: + * + * - On x86, the "_alt" forms avoid BMI and ADX instruction set + * extensions, so will run on any x86_64 machine, even older ones + * + * - On ARM, the "_alt" forms target machines with higher multiplier + * throughput, generally offering higher performance there. + * ---------------------------------------------------------------------------- + */ + +/* Add, z := x + y */ +/* Inputs x[m], y[n]; outputs function return (carry-out) and z[p] */ +extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_add_p256 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_add_p25519 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_add_p256k1 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced */ +/* Inputs x[6], y[6]; output z[6] */ +extern void bignum_add_p384 (uint64_t z[6], uint64_t x[6], uint64_t y[6]); + +/* Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced */ +/* Inputs x[9], y[9]; output z[9] */ +extern void bignum_add_p521 (uint64_t z[9], uint64_t x[9], uint64_t y[9]); + +/* Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_add_sm2 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Compute "amontification" constant z :== 2^{128k} (congruent mod m) */ +/* Input m[k]; output z[k]; temporary buffer t[>=k] */ +extern void bignum_amontifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); + +/* Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m) */ +/* Inputs x[k], y[k], m[k]; output z[k] */ +extern void bignum_amontmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); + +/* Almost-Montgomery reduce, z :== (x' 
/ 2^{64p}) (congruent mod m) */ +/* Inputs x[n], m[k], p; output z[k] */ +extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); + +/* Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m) */ +/* Inputs x[k], m[k]; output z[k] */ +extern void bignum_amontsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); + +/* Convert 4-digit (256-bit) bignum to/from big-endian form */ +/* Input x[4]; output z[4] */ +extern void bignum_bigendian_4 (uint64_t z[4], uint64_t x[4]); + +/* Convert 6-digit (384-bit) bignum to/from big-endian form */ +/* Input x[6]; output z[6] */ +extern void bignum_bigendian_6 (uint64_t z[6], uint64_t x[6]); + +/* Select bitfield starting at bit n with length l <= 64 */ +/* Inputs x[k], n, l; output function return */ +extern uint64_t bignum_bitfield (uint64_t k, uint64_t *x, uint64_t n, uint64_t l); + +/* Return size of bignum in bits */ +/* Input x[k]; output function return */ +extern uint64_t bignum_bitsize (uint64_t k, uint64_t *x); + +/* Divide by a single (nonzero) word, z := x / m and return x mod m */ +/* Inputs x[n], m; outputs function return (remainder) and z[k] */ +extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); + +/* Divide by a single word, z := x / m when known to be exact */ +/* Inputs x[n], m; output z[k] */ +extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); + +/* Count leading zero digits (64-bit words) */ +/* Input x[k]; output function return */ +extern uint64_t bignum_cld (uint64_t k, uint64_t *x); + +/* Count leading zero bits */ +/* Input x[k]; output function return */ +extern uint64_t bignum_clz (uint64_t k, uint64_t *x); + +/* Multiply-add with single-word multiplier, z := z + c * y */ +/* Inputs c, y[n]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); + +/* Negated multiply-add with single-word multiplier, z := z - c * y */ +/* Inputs c, y[n]; outputs function return (negative carry-out) and z[k] */ +extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); + +/* Find modulus of bignum w.r.t. 
single nonzero word m, returning x mod m */ +/* Input x[k], m; output function return */ +extern uint64_t bignum_cmod (uint64_t k, uint64_t *x, uint64_t m); + +/* Multiply by a single word, z := c * y */ +/* Inputs c, y[n]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); + +/* Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming x reduced */ +/* Inputs c, x[4]; output z[4] */ +extern void bignum_cmul_p25519 (uint64_t z[4], uint64_t c, uint64_t x[4]); +extern void bignum_cmul_p25519_alt (uint64_t z[4], uint64_t c, uint64_t x[4]); + +/* Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming x reduced */ +/* Inputs c, x[4]; output z[4] */ +extern void bignum_cmul_p256 (uint64_t z[4], uint64_t c, uint64_t x[4]); +extern void bignum_cmul_p256_alt (uint64_t z[4], uint64_t c, uint64_t x[4]); + +/* Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming x reduced */ +/* Inputs c, x[4]; output z[4] */ +extern void bignum_cmul_p256k1 (uint64_t z[4], uint64_t c, uint64_t x[4]); +extern void bignum_cmul_p256k1_alt (uint64_t z[4], uint64_t c, uint64_t x[4]); + +/* Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming x reduced */ +/* Inputs c, x[6]; output z[6] */ +extern void bignum_cmul_p384 (uint64_t z[6], uint64_t c, uint64_t x[6]); +extern void bignum_cmul_p384_alt (uint64_t z[6], uint64_t c, uint64_t x[6]); + +/* Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming x reduced */ +/* Inputs c, x[9]; output z[9] */ +extern void bignum_cmul_p521 (uint64_t z[9], uint64_t c, uint64_t x[9]); +extern void bignum_cmul_p521_alt (uint64_t z[9], uint64_t c, uint64_t x[9]); + +/* Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming x reduced */ +/* Inputs c, x[4]; output z[4] */ +extern void bignum_cmul_sm2 (uint64_t z[4], uint64_t c, uint64_t x[4]); +extern void bignum_cmul_sm2_alt (uint64_t z[4], uint64_t c, uint64_t x[4]); + +/* Test bignums for coprimality, gcd(x,y) = 1 */ +/* Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)] */ +extern uint64_t bignum_coprime (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y, uint64_t *t); + +/* Copy bignum with zero-extension or truncation, z := x */ +/* Input x[n]; output z[k] */ +extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); + +/* Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] */ +/* into z[0..width-1]. */ +/* This function is constant-time with respect to the value of `idx`. This is */ +/* achieved by reading the whole table and using the bit-masking to get the */ +/* `idx`-th row. */ +/* Input table[height*width]; output z[width] */ +extern void bignum_copy_row_from_table (uint64_t *z, uint64_t *table, uint64_t height, + uint64_t width, uint64_t idx); + +/* Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] */ +/* into z[0..width-1]. width must be a multiple of 8. */ +/* This function is constant-time with respect to the value of `idx`. This is */ +/* achieved by reading the whole table and using the bit-masking to get the */ +/* `idx`-th row. */ +/* Input table[height*width]; output z[width] */ +extern void bignum_copy_row_from_table_8n (uint64_t *z, uint64_t *table, + uint64_t height, uint64_t width, uint64_t idx); + +/* Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] into z[0..row-1]. 
*/ +/* This function is constant-time with respect to the value of `idx`. This is */ +/* achieved by reading the whole table and using the bit-masking to get the */ +/* `idx`-th row. */ +/* Input table[height*16]; output z[16] */ +extern void bignum_copy_row_from_table_16 (uint64_t *z, uint64_t *table, + uint64_t height, uint64_t idx); + +/* Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] into z[0..row-1]. */ +/* This function is constant-time with respect to the value of `idx`. This is */ +/* achieved by reading the whole table and using the bit-masking to get the */ +/* `idx`-th row. */ +/* Input table[height*32]; output z[32] */ +extern void bignum_copy_row_from_table_32 (uint64_t *z, uint64_t *table, + uint64_t height, uint64_t idx); + +/* Count trailing zero digits (64-bit words) */ +/* Input x[k]; output function return */ +extern uint64_t bignum_ctd (uint64_t k, uint64_t *x); + +/* Count trailing zero bits */ +/* Input x[k]; output function return */ +extern uint64_t bignum_ctz (uint64_t k, uint64_t *x); + +/* Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_deamont_p256 (uint64_t z[4], uint64_t x[4]); +extern void bignum_deamont_p256_alt (uint64_t z[4], uint64_t x[4]); + +/* Convert from almost-Montgomery form, z := (x / 2^256) mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_deamont_p256k1 (uint64_t z[4], uint64_t x[4]); + +/* Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_deamont_p384 (uint64_t z[6], uint64_t x[6]); +extern void bignum_deamont_p384_alt (uint64_t z[6], uint64_t x[6]); + +/* Convert from almost-Montgomery form z := (x / 2^576) mod p_521 */ +/* Input x[9]; output z[9] */ +extern void bignum_deamont_p521 (uint64_t z[9], uint64_t x[9]); + +/* Convert from almost-Montgomery form z := (x / 2^256) mod p_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_deamont_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m */ +/* Inputs x[k], m[k]; output z[k] */ +extern void bignum_demont (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); + +/* Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_demont_p256 (uint64_t z[4], uint64_t x[4]); +extern void bignum_demont_p256_alt (uint64_t z[4], uint64_t x[4]); + +/* Convert from Montgomery form z := (x / 2^256) mod p_256k1, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_demont_p256k1 (uint64_t z[4], uint64_t x[4]); + +/* Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced */ +/* Input x[6]; output z[6] */ +extern void bignum_demont_p384 (uint64_t z[6], uint64_t x[6]); +extern void bignum_demont_p384_alt (uint64_t z[6], uint64_t x[6]); + +/* Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_demont_p521 (uint64_t z[9], uint64_t x[9]); + +/* Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_demont_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Select digit x[n] */ +/* Inputs x[k], n; output function return */ +extern uint64_t bignum_digit (uint64_t k, uint64_t *x, uint64_t n); + +/* Return size of bignum in digits (64-bit word) */ +/* Input x[k]; output function return */ +extern uint64_t bignum_digitsize (uint64_t k, uint64_t 
*x); + +/* Divide bignum by 10: z' := z div 10, returning remainder z mod 10 */ +/* Inputs z[k]; outputs function return (remainder) and z[k] */ +extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z); + +/* Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_double_p25519 (uint64_t z[4], uint64_t x[4]); + +/* Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_double_p256 (uint64_t z[4], uint64_t x[4]); + +/* Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_double_p256k1 (uint64_t z[4], uint64_t x[4]); + +/* Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced */ +/* Input x[6]; output z[6] */ +extern void bignum_double_p384 (uint64_t z[6], uint64_t x[6]); + +/* Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_double_p521 (uint64_t z[9], uint64_t x[9]); + +/* Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_double_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Extended Montgomery reduce, returning results in input-output buffer */ +/* Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] */ +extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); + +/* Extended Montgomery reduce in 8-digit blocks, results in input-output buffer */ +/* Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] */ +extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// Temporary buffer m_precalc[12*(k/4-1)] +extern uint64_t bignum_emontredc_8n_cdiff (uint64_t k, uint64_t *z, uint64_t *m, + uint64_t w, uint64_t *m_precalc); +/* Test bignums for equality, x = y */ +/* Inputs x[m], y[n]; output function return */ +extern uint64_t bignum_eq (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Test bignum for even-ness */ +/* Input x[k]; output function return */ +extern uint64_t bignum_even (uint64_t k, uint64_t *x); + +/* Convert 4-digit (256-bit) bignum from big-endian bytes */ +/* Input x[32] (bytes); output z[4] */ +extern void bignum_frombebytes_4 (uint64_t z[4], uint8_t x[32]); + +/* Convert 6-digit (384-bit) bignum from big-endian bytes */ +/* Input x[48] (bytes); output z[6] */ +extern void bignum_frombebytes_6 (uint64_t z[6], uint8_t x[48]); + +/* Convert 4-digit (256-bit) bignum from little-endian bytes */ +/* Input x[32] (bytes); output z[4] */ +extern void bignum_fromlebytes_4 (uint64_t z[4], uint8_t x[32]); + +/* Convert 6-digit (384-bit) bignum from little-endian bytes */ +/* Input x[48] (bytes); output z[6] */ +extern void bignum_fromlebytes_6 (uint64_t z[6], uint8_t x[48]); + +/* Convert little-endian bytes to 9-digit 528-bit bignum */ +/* Input x[66] (bytes); output z[9] */ +extern void bignum_fromlebytes_p521 (uint64_t z[9],uint8_t x[66]); + +/* Compare bignums, x >= y */ +/* Inputs x[m], y[n]; output function return */ +extern uint64_t bignum_ge (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Compare bignums, x > y */ +/* Inputs x[m], y[n]; output function return */ +extern uint64_t bignum_gt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced */ +/* Input x[4]; output z[4] 
*/ +extern void bignum_half_p256 (uint64_t z[4], uint64_t x[4]); + +/* Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_half_p256k1 (uint64_t z[4], uint64_t x[4]); + +/* Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced */ +/* Input x[6]; output z[6] */ +extern void bignum_half_p384 (uint64_t z[6], uint64_t x[6]); + +/* Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_half_p521 (uint64_t z[9], uint64_t x[9]); + +/* Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_half_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Modular inverse modulo p_25519 = 2^255 - 19 */ +/* Input x[4]; output z[4] */ +extern void bignum_inv_p25519(uint64_t z[4],uint64_t x[4]); + +/* Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 */ +/* Input x[4]; output z[4] */ +extern void bignum_inv_p256(uint64_t z[4],uint64_t x[4]); + +/* Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 */ +/* Input x[6]; output z[6] */ +extern void bignum_inv_p384(uint64_t z[6],uint64_t x[6]); + +/* Modular inverse modulo p_521 = 2^521 - 1 */ +/* Input x[9]; output z[9] */ +extern void bignum_inv_p521(uint64_t z[9],uint64_t x[9]); + +/* Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 */ +/* Input x[4]; output z[4] */ +extern void bignum_inv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],uint64_t x[S2N_BIGNUM_STATIC 4]); + +/* Inverse square root modulo p_25519 */ +/* Input x[4]; output function return (Legendre symbol) and z[4] */ +extern int64_t bignum_invsqrt_p25519(uint64_t z[4],uint64_t x[4]); +extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[4],uint64_t x[4]); + +/* Test bignum for zero-ness, x = 0 */ +/* Input x[k]; output function return */ +extern uint64_t bignum_iszero (uint64_t k, uint64_t *x); + +/* Multiply z := x * y */ +/* Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] */ +extern void bignum_kmul_16_32 (uint64_t z[32], uint64_t x[16], uint64_t y[16], uint64_t t[32]); + +/* Multiply z := x * y */ +/* Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] */ +extern void bignum_kmul_32_64 (uint64_t z[64], uint64_t x[32], uint64_t y[32], uint64_t t[96]); + +/* Square, z := x^2 */ +/* Input x[16]; output z[32]; temporary buffer t[>=24] */ +extern void bignum_ksqr_16_32 (uint64_t z[32], uint64_t x[16], uint64_t t[24]); + +/* Square, z := x^2 */ +/* Input x[32]; output z[64]; temporary buffer t[>=72] */ +extern void bignum_ksqr_32_64 (uint64_t z[64], uint64_t x[32], uint64_t t[72]); + +/* Compare bignums, x <= y */ +/* Inputs x[m], y[n]; output function return */ +extern uint64_t bignum_le (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Convert 4-digit (256-bit) bignum to/from little-endian form */ +/* Input x[4]; output z[4] */ +extern void bignum_littleendian_4 (uint64_t z[4], uint64_t x[4]); + +/* Convert 6-digit (384-bit) bignum to/from little-endian form */ +/* Input x[6]; output z[6] */ +extern void bignum_littleendian_6 (uint64_t z[6], uint64_t x[6]); + +/* Compare bignums, x < y */ +/* Inputs x[m], y[n]; output function return */ +extern uint64_t bignum_lt (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Multiply-add, z := z + x * y */ +/* Inputs x[m], y[n]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Multiply-add modulo the 
order of the curve25519/edwards25519 basepoint */ +/* Inputs x[4], y[4], c[4]; output z[4] */ +extern void bignum_madd_n25519 (uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4]); +extern void bignum_madd_n25519_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4], uint64_t c[4]); + +/* Reduce modulo group order, z := x mod m_25519 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_m25519_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo basepoint order, z := x mod n_25519 */ +/* Input x[k]; output z[4] */ +extern void bignum_mod_n25519 (uint64_t z[4], uint64_t k, uint64_t *x); + +/* Reduce modulo basepoint order, z := x mod n_25519 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_n25519_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo group order, z := x mod n_256 */ +/* Input x[k]; output z[4] */ +extern void bignum_mod_n256 (uint64_t z[4], uint64_t k, uint64_t *x); +extern void bignum_mod_n256_alt (uint64_t z[4], uint64_t k, uint64_t *x); + +/* Reduce modulo group order, z := x mod n_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_n256_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo group order, z := x mod n_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_n256k1_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo group order, z := x mod n_384 */ +/* Input x[k]; output z[6] */ +extern void bignum_mod_n384 (uint64_t z[6], uint64_t k, uint64_t *x); +extern void bignum_mod_n384_alt (uint64_t z[6], uint64_t k, uint64_t *x); + +/* Reduce modulo group order, z := x mod n_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_mod_n384_6 (uint64_t z[6], uint64_t x[6]); + +/* Reduce modulo group order, z := x mod n_521 */ +/* Input x[9]; output z[9] */ +extern void bignum_mod_n521_9 (uint64_t z[9], uint64_t x[9]); +extern void bignum_mod_n521_9_alt (uint64_t z[9], uint64_t x[9]); + +/* Reduce modulo group order, z := x mod n_sm2 */ +/* Input x[k]; output z[4] */ +extern void bignum_mod_nsm2 (uint64_t z[4], uint64_t k, uint64_t *x); +extern void bignum_mod_nsm2_alt (uint64_t z[4], uint64_t k, uint64_t *x); + +/* Reduce modulo group order, z := x mod n_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_nsm2_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo field characteristic, z := x mod p_25519 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_p25519_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo field characteristic, z := x mod p_256 */ +/* Input x[k]; output z[4] */ +extern void bignum_mod_p256 (uint64_t z[4], uint64_t k, uint64_t *x); +extern void bignum_mod_p256_alt (uint64_t z[4], uint64_t k, uint64_t *x); + +/* Reduce modulo field characteristic, z := x mod p_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_p256_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo field characteristic, z := x mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_p256k1_4 (uint64_t z[4], uint64_t x[4]); + +/* Reduce modulo field characteristic, z := x mod p_384 */ +/* Input x[k]; output z[6] */ +extern void bignum_mod_p384 (uint64_t z[6], uint64_t k, uint64_t *x); +extern void bignum_mod_p384_alt (uint64_t z[6], uint64_t k, uint64_t *x); + +/* Reduce modulo field characteristic, z := x mod p_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_mod_p384_6 (uint64_t z[6], uint64_t x[6]); + +/* Reduce modulo field characteristic, z := x mod p_521 */ +/* Input x[9]; output z[9] */ +extern void bignum_mod_p521_9 (uint64_t z[9], uint64_t x[9]); + +/* Reduce modulo field 
characteristic, z := x mod p_sm2 */ +/* Input x[k]; output z[4] */ +extern void bignum_mod_sm2 (uint64_t z[4], uint64_t k, uint64_t *x); + +/* Reduce modulo field characteristic, z := x mod p_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_mod_sm2_4 (uint64_t z[4], uint64_t x[4]); + +/* Add modulo m, z := (x + y) mod m, assuming x and y reduced */ +/* Inputs x[k], y[k], m[k]; output z[k] */ +extern void bignum_modadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); + +/* Double modulo m, z := (2 * x) mod m, assuming x reduced */ +/* Inputs x[k], m[k]; output z[k] */ +extern void bignum_moddouble (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); + +/* Modular exponentiation for arbitrary odd modulus, z := (a^p) mod m */ +/* Inputs a[k], p[k], m[k]; output z[k], temporary buffer t[>=3*k] */ +extern void bignum_modexp(uint64_t k,uint64_t *z, uint64_t *a,uint64_t *p,uint64_t *m,uint64_t *t); + +/* Compute "modification" constant z := 2^{64k} mod m */ +/* Input m[k]; output z[k]; temporary buffer t[>=k] */ +extern void bignum_modifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); + +/* Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, a coprime to b */ +/* Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k] */ +extern void bignum_modinv (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t); + +/* Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[k], m[k]; output z[k] */ +extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x, uint64_t *m); + +/* Subtract modulo m, z := (x - y) mod m, assuming x and y reduced */ +/* Inputs x[k], y[k], m[k]; output z[k] */ +extern void bignum_modsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); + +/* Compute "montification" constant z := 2^{128k} mod m */ +/* Input m[k]; output z[k]; temporary buffer t[>=k] */ +extern void bignum_montifier (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); + +/* Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 */ +/* Input x[4]; output z[4] */ +extern void bignum_montinv_p256(uint64_t z[4],uint64_t x[4]); + +/* Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 */ +/* Input x[6]; output z[6] */ +extern void bignum_montinv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],uint64_t x[S2N_BIGNUM_STATIC 6]); + +/* Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 */ +/* Input x[4]; output z[4] */ +extern void bignum_montinv_sm2(uint64_t z[4],uint64_t x[4]); + +/* Montgomery multiply, z := (x * y / 2^{64k}) mod m */ +/* Inputs x[k], y[k], m[k]; output z[k] */ +extern void bignum_montmul (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); + +/* Montgomery multiply, z := (x * y / 2^256) mod p_256 */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_montmul_p256 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); +extern void bignum_montmul_p256_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Montgomery multiply, z := (x * y / 2^256) mod p_256k1 */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_montmul_p256k1 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); +extern void bignum_montmul_p256k1_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Montgomery multiply, z := (x * y / 2^384) mod p_384 */ +/* Inputs x[6], y[6]; output z[6] */ +extern void bignum_montmul_p384 (uint64_t z[6], uint64_t x[6], uint64_t y[6]); +extern void bignum_montmul_p384_alt (uint64_t z[6], uint64_t x[6], 
uint64_t y[6]); + +/* Montgomery multiply, z := (x * y / 2^576) mod p_521 */ +/* Inputs x[9], y[9]; output z[9] */ +extern void bignum_montmul_p521 (uint64_t z[9], uint64_t x[9], uint64_t y[9]); +extern void bignum_montmul_p521_alt (uint64_t z[9], uint64_t x[9], uint64_t y[9]); + +/* Montgomery multiply, z := (x * y / 2^256) mod p_sm2 */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_montmul_sm2 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); +extern void bignum_montmul_sm2_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Montgomery reduce, z := (x' / 2^{64p}) MOD m */ +/* Inputs x[n], m[k], p; output z[k] */ +extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); + +/* Montgomery square, z := (x^2 / 2^{64k}) mod m */ +/* Inputs x[k], m[k]; output z[k] */ +extern void bignum_montsqr (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); + +/* Montgomery square, z := (x^2 / 2^256) mod p_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_montsqr_p256 (uint64_t z[4], uint64_t x[4]); +extern void bignum_montsqr_p256_alt (uint64_t z[4], uint64_t x[4]); + +/* Montgomery square, z := (x^2 / 2^256) mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_montsqr_p256k1 (uint64_t z[4], uint64_t x[4]); +extern void bignum_montsqr_p256k1_alt (uint64_t z[4], uint64_t x[4]); + +/* Montgomery square, z := (x^2 / 2^384) mod p_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_montsqr_p384 (uint64_t z[6], uint64_t x[6]); +extern void bignum_montsqr_p384_alt (uint64_t z[6], uint64_t x[6]); + +/* Montgomery square, z := (x^2 / 2^576) mod p_521 */ +/* Input x[9]; output z[9] */ +extern void bignum_montsqr_p521 (uint64_t z[9], uint64_t x[9]); +extern void bignum_montsqr_p521_alt (uint64_t z[9], uint64_t x[9]); + +/* Montgomery square, z := (x^2 / 2^256) mod p_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_montsqr_sm2 (uint64_t z[4], uint64_t x[4]); +extern void bignum_montsqr_sm2_alt (uint64_t z[4], uint64_t x[4]); + +/* Multiply z := x * y */ +/* Inputs x[m], y[n]; output z[k] */ +extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Multiply z := x * y */ +/* Inputs x[4], y[4]; output z[8] */ +extern void bignum_mul_4_8 (uint64_t z[8], uint64_t x[4], uint64_t y[4]); +extern void bignum_mul_4_8_alt (uint64_t z[8], uint64_t x[4], uint64_t y[4]); + +/* Multiply z := x * y */ +/* Inputs x[6], y[6]; output z[12] */ +extern void bignum_mul_6_12 (uint64_t z[12], uint64_t x[6], uint64_t y[6]); +extern void bignum_mul_6_12_alt (uint64_t z[12], uint64_t x[6], uint64_t y[6]); + +/* Multiply z := x * y */ +/* Inputs x[8], y[8]; output z[16] */ +extern void bignum_mul_8_16 (uint64_t z[16], uint64_t x[8], uint64_t y[8]); +extern void bignum_mul_8_16_alt (uint64_t z[16], uint64_t x[8], uint64_t y[8]); + +/* Multiply modulo p_25519, z := (x * y) mod p_25519 */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_mul_p25519 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); +extern void bignum_mul_p25519_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Multiply modulo p_256k1, z := (x * y) mod p_256k1 */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_mul_p256k1 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); +extern void bignum_mul_p256k1_alt (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced */ +/* Inputs x[9], y[9]; output z[9] */ +extern void bignum_mul_p521 (uint64_t z[9], 
uint64_t x[9], uint64_t y[9]); +extern void bignum_mul_p521_alt (uint64_t z[9], uint64_t x[9], uint64_t y[9]); + +/* Multiply bignum by 10 and add word: z := 10 * z + d */ +/* Inputs z[k], d; outputs function return (carry) and z[k] */ +extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d); + +/* Multiplex/select z := x (if p nonzero) or z := y (if p zero) */ +/* Inputs p, x[k], y[k]; output z[k] */ +extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y); + +/* 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) */ +/* Inputs p, x[4], y[4]; output z[4] */ +extern void bignum_mux_4 (uint64_t p, uint64_t z[4],uint64_t x[4], uint64_t y[4]); + +/* 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) */ +/* Inputs p, x[6], y[6]; output z[6] */ +extern void bignum_mux_6 (uint64_t p, uint64_t z[6],uint64_t x[6], uint64_t y[6]); + +/* Select element from 16-element table, z := xs[k*i] */ +/* Inputs xs[16*k], i; output z[k] */ +extern void bignum_mux16 (uint64_t k, uint64_t *z, uint64_t *xs, uint64_t i); + +/* Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_neg_p25519 (uint64_t z[4], uint64_t x[4]); + +/* Negate modulo p_256, z := (-x) mod p_256, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_neg_p256 (uint64_t z[4], uint64_t x[4]); + +/* Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_neg_p256k1 (uint64_t z[4], uint64_t x[4]); + +/* Negate modulo p_384, z := (-x) mod p_384, assuming x reduced */ +/* Input x[6]; output z[6] */ +extern void bignum_neg_p384 (uint64_t z[6], uint64_t x[6]); + +/* Negate modulo p_521, z := (-x) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_neg_p521 (uint64_t z[9], uint64_t x[9]); + +/* Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced */ +/* Input x[4]; output z[4] */ +extern void bignum_neg_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Negated modular inverse, z := (-1/x) mod 2^{64k} */ +/* Input x[k]; output z[k] */ +extern void bignum_negmodinv (uint64_t k, uint64_t *z, uint64_t *x); + +/* Test bignum for nonzero-ness x =/= 0 */ +/* Input x[k]; output function return */ +extern uint64_t bignum_nonzero (uint64_t k, uint64_t *x); + +/* Test 256-bit bignum for nonzero-ness x =/= 0 */ +/* Input x[4]; output function return */ +extern uint64_t bignum_nonzero_4(uint64_t x[4]); + +/* Test 384-bit bignum for nonzero-ness x =/= 0 */ +/* Input x[6]; output function return */ +extern uint64_t bignum_nonzero_6(uint64_t x[6]); + +/* Normalize bignum in-place by shifting left till top bit is 1 */ +/* Input z[k]; outputs function return (bits shifted left) and z[k] */ +extern uint64_t bignum_normalize (uint64_t k, uint64_t *z); + +/* Test bignum for odd-ness */ +/* Input x[k]; output function return */ +extern uint64_t bignum_odd (uint64_t k, uint64_t *x); + +/* Convert single digit to bignum, z := n */ +/* Input n; output z[k] */ +extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n); + +/* Optionally add, z := x + y (if p nonzero) or z := x (if p zero) */ +/* Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); + +/* Optionally negate, z := -x (if p nonzero) or z := x (if p zero) */ +/* Inputs p, x[k]; outputs function return (nonzero input) and z[k] */ +extern uint64_t 
bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x); + +/* Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[4]; output z[4] */ +extern void bignum_optneg_p25519 (uint64_t z[4], uint64_t p, uint64_t x[4]); + +/* Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[4]; output z[4] */ +extern void bignum_optneg_p256 (uint64_t z[4], uint64_t p, uint64_t x[4]); + +/* Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[4]; output z[4] */ +extern void bignum_optneg_p256k1 (uint64_t z[4], uint64_t p, uint64_t x[4]); + +/* Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[6]; output z[6] */ +extern void bignum_optneg_p384 (uint64_t z[6], uint64_t p, uint64_t x[6]); + +/* Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[9]; output z[9] */ +extern void bignum_optneg_p521 (uint64_t z[9], uint64_t p, uint64_t x[9]); + +/* Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or z := x (if p zero), assuming x reduced */ +/* Inputs p, x[4]; output z[4] */ +extern void bignum_optneg_sm2 (uint64_t z[4], uint64_t p, uint64_t x[4]); + +/* Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) */ +/* Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); + +/* Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed */ +/* Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); + +/* Return bignum of power of 2, z := 2^n */ +/* Input n; output z[k] */ +extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n); + +/* Shift bignum left by c < 64 bits z := x * 2^c */ +/* Inputs x[n], c; outputs function return (carry-out) and z[k] */ +extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); + +/* Shift bignum right by c < 64 bits z := floor(x / 2^c) */ +/* Inputs x[n], c; outputs function return (bits shifted out) and z[k] */ +extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); + +/* Square, z := x^2 */ +/* Input x[n]; output z[k] */ +extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); + +/* Square, z := x^2 */ +/* Input x[4]; output z[8] */ +extern void bignum_sqr_4_8 (uint64_t z[8], uint64_t x[4]); +extern void bignum_sqr_4_8_alt (uint64_t z[8], uint64_t x[4]); + +/* Square, z := x^2 */ +/* Input x[6]; output z[12] */ +extern void bignum_sqr_6_12 (uint64_t z[12], uint64_t x[6]); +extern void bignum_sqr_6_12_alt (uint64_t z[12], uint64_t x[6]); + +/* Square, z := x^2 */ +/* Input x[8]; output z[16] */ +extern void bignum_sqr_8_16 (uint64_t z[16], uint64_t x[8]); +extern void bignum_sqr_8_16_alt (uint64_t z[16], uint64_t x[8]); + +/* Square modulo p_25519, z := (x^2) mod p_25519 */ +/* Input x[4]; output z[4] */ +extern void bignum_sqr_p25519 (uint64_t z[4], uint64_t x[4]); +extern void bignum_sqr_p25519_alt (uint64_t z[4], uint64_t x[4]); + +/* Square modulo p_256k1, z := (x^2) mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void 
bignum_sqr_p256k1 (uint64_t z[4], uint64_t x[4]); +extern void bignum_sqr_p256k1_alt (uint64_t z[4], uint64_t x[4]); + +/* Square modulo p_521, z := (x^2) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_sqr_p521 (uint64_t z[9], uint64_t x[9]); +extern void bignum_sqr_p521_alt (uint64_t z[9], uint64_t x[9]); + +/* Square root modulo p_25519 */ +/* Input x[4]; output function return (Legendre symbol) and z[4] */ +extern int64_t bignum_sqrt_p25519(uint64_t z[4],uint64_t x[4]); +extern int64_t bignum_sqrt_p25519_alt(uint64_t z[4],uint64_t x[4]); + +/* Subtract, z := x - y */ +/* Inputs x[m], y[n]; outputs function return (carry-out) and z[p] */ +extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); + +/* Subtract modulo p_25519, z := (x - y) mod p_25519, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_sub_p25519 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Subtract modulo p_256, z := (x - y) mod p_256, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_sub_p256 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Subtract modulo p_256k1, z := (x - y) mod p_256k1, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_sub_p256k1 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Subtract modulo p_384, z := (x - y) mod p_384, assuming x and y reduced */ +/* Inputs x[6], y[6]; output z[6] */ +extern void bignum_sub_p384 (uint64_t z[6], uint64_t x[6], uint64_t y[6]); + +/* Subtract modulo p_521, z := (x - y) mod p_521, assuming x and y reduced */ +/* Inputs x[9], y[9]; output z[9] */ +extern void bignum_sub_p521 (uint64_t z[9], uint64_t x[9], uint64_t y[9]); + +/* Subtract modulo p_sm2, z := (x - y) mod p_sm2, assuming x and y reduced */ +/* Inputs x[4], y[4]; output z[4] */ +extern void bignum_sub_sm2 (uint64_t z[4], uint64_t x[4], uint64_t y[4]); + +/* Convert 4-digit (256-bit) bignum to big-endian bytes */ +/* Input x[4]; output z[32] (bytes) */ +extern void bignum_tobebytes_4 (uint8_t z[32], uint64_t x[4]); + +/* Convert 6-digit (384-bit) bignum to big-endian bytes */ +/* Input x[6]; output z[48] (bytes) */ +extern void bignum_tobebytes_6 (uint8_t z[48], uint64_t x[6]); + +/* Convert 4-digit (256-bit) bignum to little-endian bytes */ +/* Input x[4]; output z[32] (bytes) */ +extern void bignum_tolebytes_4 (uint8_t z[32], uint64_t x[4]); + +/* Convert 6-digit (384-bit) bignum to little-endian bytes */ +/* Input x[6]; output z[48] (bytes) */ +extern void bignum_tolebytes_6 (uint8_t z[48], uint64_t x[6]); + +/* Convert 9-digit 528-bit bignum to little-endian bytes */ +/* Input x[9]; output z[66] (bytes) */ +extern void bignum_tolebytes_p521 (uint8_t z[66], uint64_t x[9]); + +/* Convert to Montgomery form z := (2^256 * x) mod p_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_tomont_p256 (uint64_t z[4], uint64_t x[4]); +extern void bignum_tomont_p256_alt (uint64_t z[4], uint64_t x[4]); + +/* Convert to Montgomery form z := (2^256 * x) mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_tomont_p256k1 (uint64_t z[4], uint64_t x[4]); +extern void bignum_tomont_p256k1_alt (uint64_t z[4], uint64_t x[4]); + +/* Convert to Montgomery form z := (2^384 * x) mod p_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_tomont_p384 (uint64_t z[6], uint64_t x[6]); +extern void bignum_tomont_p384_alt (uint64_t z[6], uint64_t x[6]); + +/* Convert to Montgomery form z := (2^576 * x) mod p_521
*/ +/* Input x[9]; output z[9] */ +extern void bignum_tomont_p521 (uint64_t z[9], uint64_t x[9]); + +/* Convert to Montgomery form z := (2^256 * x) mod p_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_tomont_sm2 (uint64_t z[4], uint64_t x[4]); + +/* Triple modulo p_256, z := (3 * x) mod p_256 */ +/* Input x[4]; output z[4] */ +extern void bignum_triple_p256 (uint64_t z[4], uint64_t x[4]); +extern void bignum_triple_p256_alt (uint64_t z[4], uint64_t x[4]); + +/* Triple modulo p_256k1, z := (3 * x) mod p_256k1 */ +/* Input x[4]; output z[4] */ +extern void bignum_triple_p256k1 (uint64_t z[4], uint64_t x[4]); +extern void bignum_triple_p256k1_alt (uint64_t z[4], uint64_t x[4]); + +/* Triple modulo p_384, z := (3 * x) mod p_384 */ +/* Input x[6]; output z[6] */ +extern void bignum_triple_p384 (uint64_t z[6], uint64_t x[6]); +extern void bignum_triple_p384_alt (uint64_t z[6], uint64_t x[6]); + +/* Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced */ +/* Input x[9]; output z[9] */ +extern void bignum_triple_p521 (uint64_t z[9], uint64_t x[9]); +extern void bignum_triple_p521_alt (uint64_t z[9], uint64_t x[9]); + +/* Triple modulo p_sm2, z := (3 * x) mod p_sm2 */ +/* Input x[4]; output z[4] */ +extern void bignum_triple_sm2 (uint64_t z[4], uint64_t x[4]); +extern void bignum_triple_sm2_alt (uint64_t z[4], uint64_t x[4]); + +/* Montgomery ladder step for curve25519 */ +/* Inputs point[8], pp[16], b; output rr[16] */ +extern void curve25519_ladderstep(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b); +extern void curve25519_ladderstep_alt(uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b); + +/* Projective scalar multiplication, x coordinate only, for curve25519 */ +/* Inputs scalar[4], point[4]; output res[8] */ +extern void curve25519_pxscalarmul(uint64_t res[8],uint64_t scalar[4],uint64_t point[4]); +extern void curve25519_pxscalarmul_alt(uint64_t res[8],uint64_t scalar[4],uint64_t point[4]); + +/* x25519 function for curve25519 */ +/* Inputs scalar[4], point[4]; output res[4] */ +extern void curve25519_x25519(uint64_t res[4],uint64_t scalar[4],uint64_t point[4]); +extern void curve25519_x25519_alt(uint64_t res[4],uint64_t scalar[4],uint64_t point[4]); + +/* x25519 function for curve25519 (byte array arguments) */ +/* Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes) */ +extern void curve25519_x25519_byte(uint8_t res[32],uint8_t scalar[32],uint8_t point[32]); +extern void curve25519_x25519_byte_alt(uint8_t res[32],uint8_t scalar[32],uint8_t point[32]); + +/* x25519 function for curve25519 on base element 9 */ +/* Input scalar[4]; output res[4] */ +extern void curve25519_x25519base(uint64_t res[4],uint64_t scalar[4]); +extern void curve25519_x25519base_alt(uint64_t res[4],uint64_t scalar[4]); + +/* x25519 function for curve25519 on base element 9 (byte array arguments) */ +/* Input scalar[32] (bytes); output res[32] (bytes) */ +extern void curve25519_x25519base_byte(uint8_t res[32],uint8_t scalar[32]); +extern void curve25519_x25519base_byte_alt(uint8_t res[32],uint8_t scalar[32]); + +/* Decode compressed 256-bit form of edwards25519 point */ +/* Input c[32] (bytes); output function return and z[8] */ +extern uint64_t edwards25519_decode(uint64_t z[8],uint8_t c[32]); +extern uint64_t edwards25519_decode_alt(uint64_t z[8],uint8_t c[32]); + +/* Encode edwards25519 point into compressed form as 256-bit number */ +/* Input p[8]; output z[32] (bytes) */ +extern void edwards25519_encode(uint8_t z[32], uint64_t p[8]); + +/* Extended 
projective addition for edwards25519 */ +/* Inputs p1[16], p2[16]; output p3[16] */ +extern void edwards25519_epadd(uint64_t p3[16],uint64_t p1[16],uint64_t p2[16]); +extern void edwards25519_epadd_alt(uint64_t p3[16],uint64_t p1[16],uint64_t p2[16]); + +/* Extended projective doubling for edwards25519 */ +/* Inputs p1[12]; output p3[16] */ +extern void edwards25519_epdouble(uint64_t p3[16],uint64_t p1[12]); +extern void edwards25519_epdouble_alt(uint64_t p3[16],uint64_t p1[12]); + +/* Projective doubling for edwards25519 */ +/* Inputs p1[12]; output p3[12] */ +extern void edwards25519_pdouble(uint64_t p3[12],uint64_t p1[12]); +extern void edwards25519_pdouble_alt(uint64_t p3[12],uint64_t p1[12]); + +/* Extended projective + precomputed mixed addition for edwards25519 */ +/* Inputs p1[16], p2[12]; output p3[16] */ +extern void edwards25519_pepadd(uint64_t p3[16],uint64_t p1[16],uint64_t p2[12]); +extern void edwards25519_pepadd_alt(uint64_t p3[16],uint64_t p1[16],uint64_t p2[12]); + +/* Scalar multiplication by standard basepoint for edwards25519 (Ed25519) */ +/* Input scalar[4]; output res[8] */ +extern void edwards25519_scalarmulbase(uint64_t res[8],uint64_t scalar[4]); +extern void edwards25519_scalarmulbase_alt(uint64_t res[8],uint64_t scalar[4]); + +/* Double scalar multiplication for edwards25519, fresh and base point */ +/* Input scalar[4], point[8], bscalar[4]; output res[8] */ +extern void edwards25519_scalarmuldouble(uint64_t res[8],uint64_t scalar[4], uint64_t point[8],uint64_t bscalar[4]); +extern void edwards25519_scalarmuldouble_alt(uint64_t res[8],uint64_t scalar[4], uint64_t point[8],uint64_t bscalar[4]); + +/* Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12], p2[12]; output p3[12] */ +extern void p256_montjadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); +extern void p256_montjadd_alt(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); + +/* Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12]; output p3[12] */ +extern void p256_montjdouble(uint64_t p3[12],uint64_t p1[12]); +extern void p256_montjdouble_alt(uint64_t p3[12],uint64_t p1[12]); + +/* Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12], p2[8]; output p3[12] */ +extern void p256_montjmixadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[8]); +extern void p256_montjmixadd_alt(uint64_t p3[12],uint64_t p1[12],uint64_t p2[8]); + +/* Montgomery-Jacobian form scalar multiplication for P-256 */ +/* Input scalar[4], point[12]; output res[12] */ +extern void p256_montjscalarmul(uint64_t res[12],uint64_t scalar[4],uint64_t point[12]); +extern void p256_montjscalarmul_alt(uint64_t res[12],uint64_t scalar[4],uint64_t point[12]); + +/* Scalar multiplication for NIST curve P-256 */ +/* Input scalar[4], point[8]; output res[8] */ +extern void p256_scalarmul(uint64_t res[8],uint64_t scalar[4],uint64_t point[8]); +extern void p256_scalarmul_alt(uint64_t res[8],uint64_t scalar[4],uint64_t point[8]); + +/* Scalar multiplication for precomputed point on NIST curve P-256 */ +/* Input scalar[4], blocksize, table[]; output res[8] */ +extern void p256_scalarmulbase(uint64_t res[8],uint64_t scalar[4],uint64_t blocksize,uint64_t *table); +extern void p256_scalarmulbase_alt(uint64_t res[8],uint64_t scalar[4],uint64_t blocksize,uint64_t *table); + +/* Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates */ +/* Inputs p1[18], p2[18]; output p3[18] */ +extern void p384_montjadd(uint64_t p3[18],uint64_t 
p1[18],uint64_t p2[18]); +extern void p384_montjadd_alt(uint64_t p3[18],uint64_t p1[18],uint64_t p2[18]); + +/* Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates */ +/* Inputs p1[18]; output p3[18] */ +extern void p384_montjdouble(uint64_t p3[18],uint64_t p1[18]); +extern void p384_montjdouble_alt(uint64_t p3[18],uint64_t p1[18]); + +/* Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates */ +/* Inputs p1[18], p2[12]; output p3[18] */ +extern void p384_montjmixadd(uint64_t p3[18],uint64_t p1[18],uint64_t p2[12]); +extern void p384_montjmixadd_alt(uint64_t p3[18],uint64_t p1[18],uint64_t p2[12]); + +/* Montgomery-Jacobian form scalar multiplication for P-384 */ +/* Input scalar[6], point[18]; output res[18] */ +extern void p384_montjscalarmul(uint64_t res[18],uint64_t scalar[6],uint64_t point[18]); +extern void p384_montjscalarmul_alt(uint64_t res[18],uint64_t scalar[6],uint64_t point[18]); + +/* Point addition on NIST curve P-521 in Jacobian coordinates */ +/* Inputs p1[27], p2[27]; output p3[27] */ +extern void p521_jadd(uint64_t p3[27],uint64_t p1[27],uint64_t p2[27]); +extern void p521_jadd_alt(uint64_t p3[27],uint64_t p1[27],uint64_t p2[27]); + +/* Point doubling on NIST curve P-521 in Jacobian coordinates */ +/* Input p1[27]; output p3[27] */ +extern void p521_jdouble(uint64_t p3[27],uint64_t p1[27]); +extern void p521_jdouble_alt(uint64_t p3[27],uint64_t p1[27]); + +/* Point mixed addition on NIST curve P-521 in Jacobian coordinates */ +/* Inputs p1[27], p2[18]; output p3[27] */ +extern void p521_jmixadd(uint64_t p3[27],uint64_t p1[27],uint64_t p2[18]); +extern void p521_jmixadd_alt(uint64_t p3[27],uint64_t p1[27],uint64_t p2[18]); + +/* Jacobian form scalar multiplication for P-521 */ +/* Input scalar[9], point[27]; output res[27] */ +extern void p521_jscalarmul(uint64_t res[27],uint64_t scalar[9],uint64_t point[27]); +extern void p521_jscalarmul_alt(uint64_t res[27],uint64_t scalar[9],uint64_t point[27]); + +/* Point addition on SECG curve secp256k1 in Jacobian coordinates */ +/* Inputs p1[12], p2[12]; output p3[12] */ +extern void secp256k1_jadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); +extern void secp256k1_jadd_alt(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); + +/* Point doubling on SECG curve secp256k1 in Jacobian coordinates */ +/* Input p1[12]; output p3[12] */ +extern void secp256k1_jdouble(uint64_t p3[12],uint64_t p1[12]); +extern void secp256k1_jdouble_alt(uint64_t p3[12],uint64_t p1[12]); + +/* Point mixed addition on SECG curve secp256k1 in Jacobian coordinates */ +/* Inputs p1[12], p2[8]; output p3[12] */ +extern void secp256k1_jmixadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[8]); +extern void secp256k1_jmixadd_alt(uint64_t p3[12],uint64_t p1[12],uint64_t p2[8]); + +/* Point addition on CC curve SM2 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12], p2[12]; output p3[12] */ +extern void sm2_montjadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); +extern void sm2_montjadd_alt(uint64_t p3[12],uint64_t p1[12],uint64_t p2[12]); + +/* Point doubling on CC curve SM2 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12]; output p3[12] */ +extern void sm2_montjdouble(uint64_t p3[12],uint64_t p1[12]); +extern void sm2_montjdouble_alt(uint64_t p3[12],uint64_t p1[12]); + +/* Point mixed addition on CC curve SM2 in Montgomery-Jacobian coordinates */ +/* Inputs p1[12], p2[8]; output p3[12] */ +extern void sm2_montjmixadd(uint64_t p3[12],uint64_t p1[12],uint64_t p2[8]); +extern void sm2_montjmixadd_alt(uint64_t 
p3[12],uint64_t p1[12],uint64_t p2[8]); + +/* Montgomery-Jacobian form scalar multiplication for CC curve SM2 */ +/* Input scalar[4], point[12]; output res[12] */ +extern void sm2_montjscalarmul(uint64_t res[12],uint64_t scalar[4],uint64_t point[12]); +extern void sm2_montjscalarmul_alt(uint64_t res[12],uint64_t scalar[4],uint64_t point[12]); + +/* Reverse the bytes in a single word */ +/* Input a; output function return */ +extern uint64_t word_bytereverse (uint64_t a); + +/* Count leading zero bits in a single word */ +/* Input a; output function return */ +extern uint64_t word_clz (uint64_t a); + +/* Count trailing zero bits in a single word */ +/* Input a; output function return */ +extern uint64_t word_ctz (uint64_t a); + +/* Perform 59 "divstep" iterations and return signed matrix of updates */ +/* Inputs d, f, g; output m[2][2] and function return */ +extern int64_t word_divstep59(int64_t m[2][2],int64_t d,uint64_t f,uint64_t g); + +/* Return maximum of two unsigned 64-bit words */ +/* Inputs a, b; output function return */ +extern uint64_t word_max (uint64_t a, uint64_t b); + +/* Return minimum of two unsigned 64-bit words */ +/* Inputs a, b; output function return */ +extern uint64_t word_min (uint64_t a, uint64_t b); + +/* Single-word negated modular inverse (-1/a) mod 2^64 */ +/* Input a; output function return */ +extern uint64_t word_negmodinv (uint64_t a); + +/* Count number of set bits in a single 64-bit word (population count) */ +/* Input a; output function return */ +extern uint64_t word_popcount (uint64_t a); + +/* Single-word reciprocal, 2^64 + ret = ceil(2^128/a) - 1 if MSB of "a" is set */ +/* Input a; output function return */ +extern uint64_t word_recip (uint64_t a); diff --git a/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h new file mode 100644 index 00000000000..faecfec52a2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/include/s2n-bignum.h @@ -0,0 +1,1120 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// C prototypes for s2n-bignum functions, so you can use them in C programs via +// +// #include "s2n-bignum.h" +// +// The functions are listed in alphabetical order with a brief description +// in comments for each one. For more detailed documentation see the comment +// banner at the top of the corresponding assembly (.S) file, and +// for the last word in what properties it satisfies see the spec in the +// formal proof (the .ml file in the architecture-specific directory). +// +// For some functions there are additional variants with names ending in +// "_alt". These have the same core mathematical functionality as their +// non-"alt" versions, but can be better suited to some microarchitectures: +// +// - On x86, the "_alt" forms avoid BMI and ADX instruction set +// extensions, so will run on any x86_64 machine, even older ones +// +// - On ARM, the "_alt" forms target machines with higher multiplier +// throughput, generally offering higher performance there. 
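
To make the "_alt" selection concrete, here is a minimal dispatch sketch; it is an editorial illustration, not part of the imported header, and have_bmi_adx() is a hypothetical stand-in for whatever CPU-feature probe the caller already uses (e.g. a CPUID check on x86_64).

#include <stdint.h>
#include "s2n-bignum.h"

/* Hypothetical feature probe; replace with the caller's own detection. */
extern int have_bmi_adx(void);

/* Use the default form when the BMI and ADX extensions are available,
   otherwise fall back to the "_alt" form, which runs on any x86_64. */
void montmul_p256_dispatch(uint64_t z[4], const uint64_t x[4],
                           const uint64_t y[4]) {
  if (have_bmi_adx()) {
    bignum_montmul_p256(z, x, y);
  } else {
    bignum_montmul_p256_alt(z, x, y);
  }
}
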
+// ---------------------------------------------------------------------------- + + +#if defined(_MSC_VER) || !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__) +#define S2N_BIGNUM_STATIC +#else +#define S2N_BIGNUM_STATIC static +#endif + +// Add, z := x + y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +extern uint64_t bignum_add (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_add_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_add_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_add_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced +// Inputs x[6], y[6]; output z[6] +extern void bignum_add_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); + +// Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced +// Inputs x[9], y[9]; output z[9] +extern void bignum_add_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); + +// Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_add_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Compute "amontification" constant z :== 2^{128k} (congruent mod m) +// Input m[k]; output z[k]; temporary buffer t[>=k] +extern void bignum_amontifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t); + +// Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m) +// Inputs x[k], y[k], m[k]; output z[k] +extern void bignum_amontmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m); + +// Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m) +// Inputs x[n], m[k], p; output z[k] +extern void bignum_amontredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p); + +// Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m) +// Inputs x[k], m[k]; output z[k] +extern void bignum_amontsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m); + +// Convert 4-digit (256-bit) bignum to/from big-endian form +// Input x[4]; output z[4] +extern void bignum_bigendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert 6-digit (384-bit) bignum to/from big-endian form +// Input x[6]; output z[6] +extern void bignum_bigendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Select bitfield starting at bit n with length l <= 64 +// Inputs x[k], n, l; output function return +extern uint64_t bignum_bitfield (uint64_t k, const uint64_t *x, uint64_t n, uint64_t l); + +// Return size of bignum in bits +// Input x[k]; output function return +extern uint64_t 
bignum_bitsize (uint64_t k, const uint64_t *x); + +// Divide by a single (nonzero) word, z := x / m and return x mod m +// Inputs x[n], m; outputs function return (remainder) and z[k] +extern uint64_t bignum_cdiv (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m); + +// Divide by a single word, z := x / m when known to be exact +// Inputs x[n], m; output z[k] +extern void bignum_cdiv_exact (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t m); + +// Count leading zero digits (64-bit words) +// Input x[k]; output function return +extern uint64_t bignum_cld (uint64_t k, const uint64_t *x); + +// Count leading zero bits +// Input x[k]; output function return +extern uint64_t bignum_clz (uint64_t k, const uint64_t *x); + +// Multiply-add with single-word multiplier, z := z + c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_cmadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y); + +// Negated multiply-add with single-word multiplier, z := z - c * y +// Inputs c, y[n]; outputs function return (negative carry-out) and z[k] +extern uint64_t bignum_cmnegadd (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y); + +// Find modulus of bignum w.r.t. single nonzero word m, returning x mod m +// Input x[k], m; output function return +extern uint64_t bignum_cmod (uint64_t k, const uint64_t *x, uint64_t m); + +// Multiply by a single word, z := c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_cmul (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, const uint64_t *y); + +// Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming x reduced +// Inputs c, x[4]; output z[4] +extern void bignum_cmul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_cmul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming x reduced +// Inputs c, x[4]; output z[4] +extern void bignum_cmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_cmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming x reduced +// Inputs c, x[4]; output z[4] +extern void bignum_cmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_cmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming x reduced +// Inputs c, x[6]; output z[6] +extern void bignum_cmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_cmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming x reduced +// Inputs c, x[9]; output z[9] +extern void bignum_cmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_cmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming x reduced +// Inputs c, x[4]; output 
z[4] +extern void bignum_cmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_cmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t c, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Test bignums for coprimality, gcd(x,y) = 1 +// Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)] +extern uint64_t bignum_coprime (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y, uint64_t *t); + +// Copy bignum with zero-extension or truncation, z := x +// Input x[n]; output z[k] +extern void bignum_copy (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x); + +// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] +// into z[0..width-1]. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. +// Input table[height*width]; output z[width] +extern void bignum_copy_row_from_table (uint64_t *z, const uint64_t *table, uint64_t height, + uint64_t width, uint64_t idx); + +// Given table: uint64_t[height*width], copy table[idx*width...(idx+1)*width-1] +// into z[0..width-1]. width must be a multiple of 8. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. +// Input table[height*width]; output z[width] +extern void bignum_copy_row_from_table_8n (uint64_t *z, const uint64_t *table, + uint64_t height, uint64_t width, uint64_t idx); + +// Given table: uint64_t[height*16], copy table[idx*16...(idx+1)*16-1] into z[0..row-1]. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. +// Input table[height*16]; output z[16] +extern void bignum_copy_row_from_table_16 (uint64_t *z, const uint64_t *table, + uint64_t height, uint64_t idx); + +// Given table: uint64_t[height*32], copy table[idx*32...(idx+1)*32-1] into z[0..row-1]. +// This function is constant-time with respect to the value of `idx`. This is +// achieved by reading the whole table and using the bit-masking to get the +// `idx`-th row. 
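
As a usage illustration for the constant-time table-copy routines documented here, a small sketch follows; it is not part of the header, and the table dimensions and contents are arbitrary examples.

#include <stdint.h>
#include "s2n-bignum.h"

/* Select row `secret_idx` of a 4x4-word table into z. The routine reads
   every row and masks out all but the selected one, so the memory access
   pattern does not depend on the index. */
void copy_row_example(void) {
  enum { HEIGHT = 4, WIDTH = 4 };
  static const uint64_t table[HEIGHT * WIDTH] = {
      1, 0, 0, 0,  /* row 0 */
      2, 0, 0, 0,  /* row 1 */
      3, 0, 0, 0,  /* row 2 */
      4, 0, 0, 0}; /* row 3 */
  uint64_t z[WIDTH];
  uint64_t secret_idx = 2;

  bignum_copy_row_from_table(z, table, HEIGHT, WIDTH, secret_idx);
  /* z now holds row 2, i.e. {3, 0, 0, 0}. */
}
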
+// Input table[height*32]; output z[32] +extern void bignum_copy_row_from_table_32 (uint64_t *z, const uint64_t *table, + uint64_t height, uint64_t idx); + +// Count trailing zero digits (64-bit words) +// Input x[k]; output function return +extern uint64_t bignum_ctd (uint64_t k, const uint64_t *x); + +// Count trailing zero bits +// Input x[k]; output function return +extern uint64_t bignum_ctz (uint64_t k, const uint64_t *x); + +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 +// Input x[4]; output z[4] +extern void bignum_deamont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_deamont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_deamont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 +// Input x[6]; output z[6] +extern void bignum_deamont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_deamont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Convert from almost-Montgomery form z := (x / 2^576) mod p_521 +// Input x[9]; output z[9] +extern void bignum_deamont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Convert from almost-Montgomery form z := (x / 2^256) mod p_sm2 +// Input x[4]; output z[4] +extern void bignum_deamont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +extern void bignum_demont (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m); + +// Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_demont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_demont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert from Montgomery form z := (x / 2^256) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_demont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced +// Input x[6]; output z[6] +extern void bignum_demont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_demont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_demont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_demont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Select digit x[n] +// Inputs x[k], n; output function return +extern uint64_t bignum_digit (uint64_t k, const uint64_t *x, uint64_t n); + +// Return size of bignum in digits (64-bit word) +// Input x[k]; output function return +extern uint64_t bignum_digitsize (uint64_t k, const uint64_t *x); + +// Divide bignum by 10: z' := z div 10, returning remainder z mod 10 
+// Inputs z[k]; outputs function return (remainder) and z[k] +extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z); + +// Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_double_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_double_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_double_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced +// Input x[6]; output z[6] +extern void bignum_double_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_double_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_double_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Extended Montgomery reduce, returning results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +extern uint64_t bignum_emontredc (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w); + +// Extended Montgomery reduce in 8-digit blocks, results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +extern uint64_t bignum_emontredc_8n (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t w); +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// Temporary buffer m_precalc[12*(k/4-1)] +extern uint64_t bignum_emontredc_8n_cdiff (uint64_t k, uint64_t *z, const uint64_t *m, + uint64_t w, uint64_t *m_precalc); + +// Test bignums for equality, x = y +// Inputs x[m], y[n]; output function return +extern uint64_t bignum_eq (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Test bignum for even-ness +// Input x[k]; output function return +extern uint64_t bignum_even (uint64_t k, const uint64_t *x); + +// Convert 4-digit (256-bit) bignum from big-endian bytes +// Input x[32] (bytes); output z[4] +extern void bignum_frombebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]); + +// Convert 6-digit (384-bit) bignum from big-endian bytes +// Input x[48] (bytes); output z[6] +extern void bignum_frombebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]); + +// Convert 4-digit (256-bit) bignum from little-endian bytes +// Input x[32] (bytes); output z[4] +extern void bignum_fromlebytes_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint8_t x[S2N_BIGNUM_STATIC 32]); + +// Convert 6-digit (384-bit) bignum from little-endian bytes +// Input x[48] (bytes); output z[6] +extern void bignum_fromlebytes_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]); + +// Convert little-endian bytes to 9-digit 528-bit bignum +// Input x[66] (bytes); output z[9] +extern void bignum_fromlebytes_p521 (uint64_t z[S2N_BIGNUM_STATIC 9],const uint8_t x[S2N_BIGNUM_STATIC 66]); + +// Compare bignums, x >= y +// Inputs x[m], y[n]; output function return +extern uint64_t 
bignum_ge (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Compare bignums, x > y +// Inputs x[m], y[n]; output function return +extern uint64_t bignum_gt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_half_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_half_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced +// Input x[6]; output z[6] +extern void bignum_half_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_half_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_half_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Modular inverse modulo p_25519 = 2^255 - 19 +// Input x[4]; output z[4] +extern void bignum_inv_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +extern void bignum_inv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Modular inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +extern void bignum_inv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Modular inverse modulo p_521 = 2^521 - 1 +// Input x[9]; output z[9] +extern void bignum_inv_p521(uint64_t z[S2N_BIGNUM_STATIC 9],const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +extern void bignum_inv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Inverse square root modulo p_25519 +// Input x[4]; output function return (Legendre symbol) and z[4] +extern int64_t bignum_invsqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Test bignum for zero-ness, x = 0 +// Input x[k]; output function return +extern uint64_t bignum_iszero (uint64_t k, const uint64_t *x); + +// Multiply z := x * y +// Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] +extern void bignum_kmul_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], const uint64_t y[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 32]); + +// Multiply z := x * y +// Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] +extern void bignum_kmul_32_64 (uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], const uint64_t y[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 96]); + +// Square, z := x^2 +// Input x[16]; output z[32]; temporary buffer t[>=24] +extern void bignum_ksqr_16_32 (uint64_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 16], uint64_t t[S2N_BIGNUM_STATIC 24]); + +// Square, z := x^2 +// Input x[32]; output z[64]; temporary buffer t[>=72] +extern void bignum_ksqr_32_64 
(uint64_t z[S2N_BIGNUM_STATIC 64], const uint64_t x[S2N_BIGNUM_STATIC 32], uint64_t t[S2N_BIGNUM_STATIC 72]); + +// Compare bignums, x <= y +// Inputs x[m], y[n]; output function return +extern uint64_t bignum_le (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Convert 4-digit (256-bit) bignum to/from little-endian form +// Input x[4]; output z[4] +extern void bignum_littleendian_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert 6-digit (384-bit) bignum to/from little-endian form +// Input x[6]; output z[6] +extern void bignum_littleendian_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Compare bignums, x < y +// Inputs x[m], y[n]; output function return +extern uint64_t bignum_lt (uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Multiply-add, z := z + x * y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_madd (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Multiply-add modulo the order of the curve25519/edwards25519 basepoint +// Inputs x[4], y[4], c[4]; output z[4] +extern void bignum_madd_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]); +extern void bignum_madd_n25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4], const uint64_t c[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo group order, z := x mod m_25519 +// Input x[4]; output z[4] +extern void bignum_mod_m25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[k]; output z[4] +extern void bignum_mod_n25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); + +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[4]; output z[4] +extern void bignum_mod_n25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo group order, z := x mod n_256 +// Input x[k]; output z[4] +extern void bignum_mod_n256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); +extern void bignum_mod_n256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); + +// Reduce modulo group order, z := x mod n_256 +// Input x[4]; output z[4] +extern void bignum_mod_n256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo group order, z := x mod n_256k1 +// Input x[4]; output z[4] +extern void bignum_mod_n256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo group order, z := x mod n_384 +// Input x[k]; output z[6] +extern void bignum_mod_n384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x); +extern void bignum_mod_n384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x); + +// Reduce modulo group order, z := x mod n_384 +// Input x[6]; output z[6] +extern void bignum_mod_n384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Reduce modulo group order, z := x mod n_521 +// Input x[9]; output z[9] +extern void bignum_mod_n521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_mod_n521_9_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Reduce modulo group order, z := x mod 
n_sm2 +// Input x[k]; output z[4] +extern void bignum_mod_nsm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); +extern void bignum_mod_nsm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); + +// Reduce modulo group order, z := x mod n_sm2 +// Input x[4]; output z[4] +extern void bignum_mod_nsm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo field characteristic, z := x mod p_25519 +// Input x[4]; output z[4] +extern void bignum_mod_p25519_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[k]; output z[4] +extern void bignum_mod_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); +extern void bignum_mod_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); + +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[4]; output z[4] +extern void bignum_mod_p256_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo field characteristic, z := x mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_mod_p256k1_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Reduce modulo field characteristic, z := x mod p_384 +// Input x[k]; output z[6] +extern void bignum_mod_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x); +extern void bignum_mod_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t k, const uint64_t *x); + +// Reduce modulo field characteristic, z := x mod p_384 +// Input x[6]; output z[6] +extern void bignum_mod_p384_6 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Reduce modulo field characteristic, z := x mod p_521 +// Input x[9]; output z[9] +extern void bignum_mod_p521_9 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[k]; output z[4] +extern void bignum_mod_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t k, const uint64_t *x); + +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[4]; output z[4] +extern void bignum_mod_sm2_4 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Add modulo m, z := (x + y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +extern void bignum_modadd (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m); + +// Double modulo m, z := (2 * x) mod m, assuming x reduced +// Inputs x[k], m[k]; output z[k] +extern void bignum_moddouble (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m); + +// Modular exponentiation for arbitrary odd modulus, z := (a^p) mod m +// Inputs a[k], p[k], m[k]; output z[k], temporary buffer t[>=3*k] +extern void bignum_modexp(uint64_t k,uint64_t *z, const uint64_t *a,const uint64_t *p,const uint64_t *m,uint64_t *t); + +// Compute "modification" constant z := 2^{64k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +extern void bignum_modifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t); + +// Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, a coprime to b +// Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k] +extern void bignum_modinv (uint64_t k, uint64_t *z, const uint64_t *a, const uint64_t *b, uint64_t *t); + +// Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs 
p, x[k], m[k]; output z[k] +extern void bignum_modoptneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x, const uint64_t *m); + +// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +extern void bignum_modsub (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m); + +// Compute "montification" constant z := 2^{128k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +extern void bignum_montifier (uint64_t k, uint64_t *z, const uint64_t *m, uint64_t *t); + +// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +extern void bignum_montinv_p256(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Montgomery inverse modulo p_384 = 2^384 - 2^128 - 2^96 + 2^32 - 1 +// Input x[6]; output z[6] +extern void bignum_montinv_p384(uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +extern void bignum_montinv_sm2(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Montgomery multiply, z := (x * y / 2^{64k}) mod m +// Inputs x[k], y[k], m[k]; output z[k] +extern void bignum_montmul (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y, const uint64_t *m); + +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +extern void bignum_montmul_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_montmul_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +extern void bignum_montmul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_montmul_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Montgomery multiply, z := (x * y / 2^384) mod p_384 +// Inputs x[6], y[6]; output z[6] +extern void bignum_montmul_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); +extern void bignum_montmul_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); + +// Montgomery multiply, z := (x * y / 2^576) mod p_521 +// Inputs x[9], y[9]; output z[9] +extern void bignum_montmul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); +extern void bignum_montmul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); + +// Montgomery multiply, z := (x * y / 2^256) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +extern void bignum_montmul_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_montmul_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Montgomery reduce, z := (x' / 2^{64p}) MOD m +// Inputs x[n], m[k], p; output z[k] +extern void bignum_montredc (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, const uint64_t *m, uint64_t p); + +// Montgomery 
square, z := (x^2 / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +extern void bignum_montsqr (uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *m); + +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +extern void bignum_montsqr_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_montsqr_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Montgomery square, z := (x^2 / 2^256) mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_montsqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_montsqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Montgomery square, z := (x^2 / 2^384) mod p_384 +// Input x[6]; output z[6] +extern void bignum_montsqr_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_montsqr_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Montgomery square, z := (x^2 / 2^576) mod p_521 +// Input x[9]; output z[9] +extern void bignum_montsqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_montsqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Montgomery square, z := (x^2 / 2^256) mod p_sm2 +// Input x[4]; output z[4] +extern void bignum_montsqr_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_montsqr_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Multiply z := x * y +// Inputs x[m], y[n]; output z[k] +extern void bignum_mul (uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +extern void bignum_mul_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_mul_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +extern void bignum_mul_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); +extern void bignum_mul_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); + +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +extern void bignum_mul_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]); +extern void bignum_mul_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8], const uint64_t y[S2N_BIGNUM_STATIC 8]); + +// Multiply modulo p_25519, z := (x * y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +extern void bignum_mul_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_mul_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Multiply modulo p_256k1, z := (x * y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +extern void bignum_mul_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); +extern void bignum_mul_p256k1_alt (uint64_t 
z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced +// Inputs x[9], y[9]; output z[9] +extern void bignum_mul_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); +extern void bignum_mul_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); + +// Multiply bignum by 10 and add word: z := 10 * z + d +// Inputs z[k], d; outputs function return (carry) and z[k] +extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d); + +// Multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[k], y[k]; output z[k] +extern void bignum_mux (uint64_t p, uint64_t k, uint64_t *z, const uint64_t *x, const uint64_t *y); + +// 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[4], y[4]; output z[4] +extern void bignum_mux_4 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[6], y[6]; output z[6] +extern void bignum_mux_6 (uint64_t p, uint64_t z[S2N_BIGNUM_STATIC 6],const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); + +// Select element from 16-element table, z := xs[k*i] +// Inputs xs[16*k], i; output z[k] +extern void bignum_mux16 (uint64_t k, uint64_t *z, const uint64_t *xs, uint64_t i); + +// Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_neg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Negate modulo p_256, z := (-x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_neg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_neg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Negate modulo p_384, z := (-x) mod p_384, assuming x reduced +// Input x[6]; output z[6] +extern void bignum_neg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Negate modulo p_521, z := (-x) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_neg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +extern void bignum_neg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Negated modular inverse, z := (-1/x) mod 2^{64k} +// Input x[k]; output z[k] +extern void bignum_negmodinv (uint64_t k, uint64_t *z, const uint64_t *x); + +// Test bignum for nonzero-ness x =/= 0 +// Input x[k]; output function return +extern uint64_t bignum_nonzero (uint64_t k, const uint64_t *x); + +// Test 256-bit bignum for nonzero-ness x =/= 0 +// Input x[4]; output function return +extern uint64_t bignum_nonzero_4(const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Test 384-bit bignum for nonzero-ness x =/= 0 +// Input x[6]; output function return +extern uint64_t bignum_nonzero_6(const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Normalize bignum in-place by shifting left till top bit is 1 +// Input z[k]; outputs function return (bits 
shifted left) and z[k] +extern uint64_t bignum_normalize (uint64_t k, uint64_t *z); + +// Test bignum for odd-ness +// Input x[k]; output function return +extern uint64_t bignum_odd (uint64_t k, const uint64_t *x); + +// Convert single digit to bignum, z := n +// Input n; output z[k] +extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n); + +// Optionally add, z := x + y (if p nonzero) or z := x (if p zero) +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_optadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y); + +// Optionally negate, z := -x (if p nonzero) or z := x (if p zero) +// Inputs p, x[k]; outputs function return (nonzero input) and z[k] +extern uint64_t bignum_optneg (uint64_t k, uint64_t *z, uint64_t p, const uint64_t *x); + +// Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +extern void bignum_optneg_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +extern void bignum_optneg_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +extern void bignum_optneg_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[6]; output z[6] +extern void bignum_optneg_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[9]; output z[9] +extern void bignum_optneg_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +extern void bignum_optneg_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t p, const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_optsub (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y); + +// Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +extern uint64_t bignum_optsubadd (uint64_t k, uint64_t *z, const uint64_t *x, uint64_t p, const uint64_t *y); + +// Return bignum of power of 2, z := 2^n +// Input n; output z[k] +extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n); + +// Shift bignum left by c < 64 bits z := x * 2^c +// Inputs x[n], c; outputs function return (carry-out) and z[k] +extern uint64_t bignum_shl_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c); + +// Shift bignum right by c < 64 bits z := floor(x / 2^c) +// Inputs x[n], c; outputs function return (bits shifted out) and z[k] +extern uint64_t bignum_shr_small (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x, uint64_t c); + +// Square, z := 
x^2 +// Input x[n]; output z[k] +extern void bignum_sqr (uint64_t k, uint64_t *z, uint64_t n, const uint64_t *x); + +// Square, z := x^2 +// Input x[4]; output z[8] +extern void bignum_sqr_4_8 (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_sqr_4_8_alt (uint64_t z[S2N_BIGNUM_STATIC 8], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Square, z := x^2 +// Input x[6]; output z[12] +extern void bignum_sqr_6_12 (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_sqr_6_12_alt (uint64_t z[S2N_BIGNUM_STATIC 12], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Square, z := x^2 +// Input x[8]; output z[16] +extern void bignum_sqr_8_16 (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]); +extern void bignum_sqr_8_16_alt (uint64_t z[S2N_BIGNUM_STATIC 16], const uint64_t x[S2N_BIGNUM_STATIC 8]); + +// Square modulo p_25519, z := (x^2) mod p_25519 +// Input x[4]; output z[4] +extern void bignum_sqr_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_sqr_p25519_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Square modulo p_256k1, z := (x^2) mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_sqr_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_sqr_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Square modulo p_521, z := (x^2) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_sqr_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_sqr_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Square root modulo p_25519 +// Input x[4]; output function return (Legendre symbol) and z[4] +extern int64_t bignum_sqrt_p25519(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern int64_t bignum_sqrt_p25519_alt(uint64_t z[S2N_BIGNUM_STATIC 4],const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Subtract, z := x - y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +extern uint64_t bignum_sub (uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x, uint64_t n, const uint64_t *y); + +// Subtract modulo p_25519, z := (x - y) mod p_25519, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_sub_p25519 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Subtract modulo p_256, z := (x - y) mod p_256, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_sub_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Subtract modulo p_256k1, z := (x - y) mod p_256k1, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_sub_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Subtract modulo p_384, z := (x - y) mod p_384, assuming x and y reduced +// Inputs x[6], y[6]; output z[6] +extern void bignum_sub_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]); + +// Subtract modulo p_521, z := (x - y) mod p_521, assuming x and y reduced +// Inputs x[9], y[9]; output z[9] +extern void bignum_sub_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t 
x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]); + +// Subtract modulo p_sm2, z := (x - y) mod p_sm2, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +extern void bignum_sub_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4], const uint64_t y[S2N_BIGNUM_STATIC 4]); + +// Convert 4-digit (256-bit) bignum to big-endian bytes +// Input x[4]; output z[32] (bytes) +extern void bignum_tobebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert 6-digit (384-bit) bignum to big-endian bytes +// Input x[6]; output z[48] (bytes) +extern void bignum_tobebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Convert 4-digit (256-bit) bignum to little-endian bytes +// Input x[4]; output z[32] (bytes) +extern void bignum_tolebytes_4 (uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert 6-digit (384-bit) bignum to little-endian bytes +// Input x[6]; output z[48] (bytes) +extern void bignum_tolebytes_6 (uint8_t z[S2N_BIGNUM_STATIC 48], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Convert 9-digit 528-bit bignum to little-endian bytes +// Input x[9]; output z[66] (bytes) +extern void bignum_tolebytes_p521 (uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Convert to Montgomery form z := (2^256 * x) mod p_256 +// Input x[4]; output z[4] +extern void bignum_tomont_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_tomont_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert to Montgomery form z := (2^256 * x) mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_tomont_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_tomont_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Convert to Montgomery form z := (2^384 * x) mod p_384 +// Input x[6]; output z[6] +extern void bignum_tomont_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_tomont_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Convert to Montgomery form z := (2^576 * x) mod p_521 +// Input x[9]; output z[9] +extern void bignum_tomont_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Convert to Montgomery form z := (2^256 * x) mod p_sm2 +// Input x[4]; output z[4] +extern void bignum_tomont_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Triple modulo p_256, z := (3 * x) mod p_256 +// Input x[4]; output z[4] +extern void bignum_triple_p256 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_triple_p256_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Triple modulo p_256k1, z := (3 * x) mod p_256k1 +// Input x[4]; output z[4] +extern void bignum_triple_p256k1 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_triple_p256k1_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Triple modulo p_384, z := (3 * x) mod p_384 +// Input x[6]; output z[6] +extern void bignum_triple_p384 (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); +extern void bignum_triple_p384_alt (uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]); + +// Triple modulo
p_521, z := (3 * x) mod p_521, assuming x reduced +// Input x[9]; output z[9] +extern void bignum_triple_p521 (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); +extern void bignum_triple_p521_alt (uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]); + +// Triple modulo p_sm2, z := (3 * x) mod p_sm2 +// Input x[4]; output z[4] +extern void bignum_triple_sm2 (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); +extern void bignum_triple_sm2_alt (uint64_t z[S2N_BIGNUM_STATIC 4], const uint64_t x[S2N_BIGNUM_STATIC 4]); + +// Montgomery ladder step for curve25519 +// Inputs point[8], pp[16], b; output rr[16] +extern void curve25519_ladderstep(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b); +extern void curve25519_ladderstep_alt(uint64_t rr[16],const uint64_t point[8],const uint64_t pp[16],uint64_t b); + +// Projective scalar multiplication, x coordinate only, for curve25519 +// Inputs scalar[4], point[4]; output res[8] +extern void curve25519_pxscalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]); +extern void curve25519_pxscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]); + +// x25519 function for curve25519 +// Inputs scalar[4], point[4]; output res[4] +extern void curve25519_x25519(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]); +extern void curve25519_x25519_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 4]); + +// x25519 function for curve25519 (byte array arguments) +// Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes) +extern void curve25519_x25519_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]); +extern void curve25519_x25519_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32],const uint8_t point[S2N_BIGNUM_STATIC 32]); + +// x25519 function for curve25519 on base element 9 +// Input scalar[4]; output res[4] +extern void curve25519_x25519base(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]); +extern void curve25519_x25519base_alt(uint64_t res[S2N_BIGNUM_STATIC 4],const uint64_t scalar[S2N_BIGNUM_STATIC 4]); + +// x25519 function for curve25519 on base element 9 (byte array arguments) +// Input scalar[32] (bytes); output res[32] (bytes) +extern void curve25519_x25519base_byte(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]); +extern void curve25519_x25519base_byte_alt(uint8_t res[S2N_BIGNUM_STATIC 32],const uint8_t scalar[S2N_BIGNUM_STATIC 32]); + +// Decode compressed 256-bit form of edwards25519 point +// Input c[32] (bytes); output function return and z[8] +extern uint64_t edwards25519_decode(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]); +extern uint64_t edwards25519_decode_alt(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]); + +// Encode edwards25519 point into compressed form as 256-bit number +// Input p[8]; output z[32] (bytes) +extern void edwards25519_encode(uint8_t z[S2N_BIGNUM_STATIC 32], const uint64_t p[S2N_BIGNUM_STATIC 8]); + +// Extended projective addition for edwards25519 +// Inputs p1[16], p2[16]; output p3[16] +extern void 
edwards25519_epadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]); +extern void edwards25519_epadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 16]); + +// Extended projective doubling for edwards25519 +// Inputs p1[12]; output p3[16] +extern void edwards25519_epdouble(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]); +extern void edwards25519_epdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 12]); + +// Projective doubling for edwards25519 +// Inputs p1[12]; output p3[12] +extern void edwards25519_pdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); +extern void edwards25519_pdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); + +// Extended projective + precomputed mixed addition for edwards25519 +// Inputs p1[16], p2[12]; output p3[16] +extern void edwards25519_pepadd(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]); +extern void edwards25519_pepadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 16],const uint64_t p1[S2N_BIGNUM_STATIC 16],const uint64_t p2[S2N_BIGNUM_STATIC 12]); + +// Scalar multiplication by standard basepoint for edwards25519 (Ed25519) +// Input scalar[4]; output res[8] +extern void edwards25519_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]); +extern void edwards25519_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4]); + +// Double scalar multiplication for edwards25519, fresh and base point +// Input scalar[4], point[8], bscalar[4]; output res[8] +extern void edwards25519_scalarmuldouble(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]); +extern void edwards25519_scalarmuldouble_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4], const uint64_t point[S2N_BIGNUM_STATIC 8],const uint64_t bscalar[S2N_BIGNUM_STATIC 4]); + +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// Inputs p1[12], p2[12]; output p3[12] +extern void p256_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); +extern void p256_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); + +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// Inputs p1[12]; output p3[12] +extern void p256_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); +extern void p256_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); + +// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// Inputs p1[12], p2[8]; output p3[12] +extern void p256_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); +extern void p256_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); + +// Montgomery-Jacobian form scalar multiplication for P-256 +// Input scalar[4], point[12]; output res[12] +extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const 
uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]); +extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]); + +// Scalar multiplication for NIST curve P-256 +// Input scalar[4], point[8]; output res[8] +extern void p256_scalarmul(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]); +extern void p256_scalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 8]); + +// Scalar multiplication for precomputed point on NIST curve P-256 +// Input scalar[4], blocksize, table[]; output res[8] +extern void p256_scalarmulbase(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table); +extern void p256_scalarmulbase_alt(uint64_t res[S2N_BIGNUM_STATIC 8],const uint64_t scalar[S2N_BIGNUM_STATIC 4],uint64_t blocksize,const uint64_t *table); + +// Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// Inputs p1[18], p2[18]; output p3[18] +extern void p384_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]); +extern void p384_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 18]); + +// Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates +// Inputs p1[18]; output p3[18] +extern void p384_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]); +extern void p384_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18]); + +// Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates +// Inputs p1[18], p2[12]; output p3[18] +extern void p384_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]); +extern void p384_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 18],const uint64_t p1[S2N_BIGNUM_STATIC 18],const uint64_t p2[S2N_BIGNUM_STATIC 12]); + +// Montgomery-Jacobian form scalar multiplication for P-384 +// Input scalar[6], point[18]; output res[18] +extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]); +extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18],const uint64_t scalar[S2N_BIGNUM_STATIC 6],const uint64_t point[S2N_BIGNUM_STATIC 18]); + +// Point addition on NIST curve P-521 in Jacobian coordinates +// Inputs p1[27], p2[27]; output p3[27] +extern void p521_jadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]); +extern void p521_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 27]); + +// Point doubling on NIST curve P-521 in Jacobian coordinates +// Input p1[27]; output p3[27] +extern void p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]); +extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27]); + +// Point mixed addition on NIST curve P-521 in Jacobian coordinates +// Inputs p1[27], p2[18]; output p3[27] +extern void p521_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 
27],const uint64_t p2[S2N_BIGNUM_STATIC 18]); +extern void p521_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],const uint64_t p1[S2N_BIGNUM_STATIC 27],const uint64_t p2[S2N_BIGNUM_STATIC 18]); + +// Jacobian form scalar multiplication for P-521 +// Input scalar[9], point[27]; output res[27] +extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]); +extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27],const uint64_t scalar[S2N_BIGNUM_STATIC 9],const uint64_t point[S2N_BIGNUM_STATIC 27]); + +// Point addition on SECG curve secp256k1 in Jacobian coordinates +// Inputs p1[12], p2[12]; output p3[12] +extern void secp256k1_jadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); +extern void secp256k1_jadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); + +// Point doubling on SECG curve secp256k1 in Jacobian coordinates +// Input p1[12]; output p3[12] +extern void secp256k1_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); +extern void secp256k1_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); + +// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates +// Inputs p1[12], p2[8]; output p3[12] +extern void secp256k1_jmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); +extern void secp256k1_jmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); + +// Point addition on CC curve SM2 in Montgomery-Jacobian coordinates +// Inputs p1[12], p2[12]; output p3[12] +extern void sm2_montjadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); +extern void sm2_montjadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 12]); + +// Point doubling on CC curve SM2 in Montgomery-Jacobian coordinates +// Inputs p1[12]; output p3[12] +extern void sm2_montjdouble(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); +extern void sm2_montjdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12]); + +// Point mixed addition on CC curve SM2 in Montgomery-Jacobian coordinates +// Inputs p1[12], p2[8]; output p3[12] +extern void sm2_montjmixadd(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); +extern void sm2_montjmixadd_alt(uint64_t p3[S2N_BIGNUM_STATIC 12],const uint64_t p1[S2N_BIGNUM_STATIC 12],const uint64_t p2[S2N_BIGNUM_STATIC 8]); + +// Montgomery-Jacobian form scalar multiplication for CC curve SM2 +// Input scalar[4], point[12]; output res[12] +extern void sm2_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]); +extern void sm2_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12],const uint64_t scalar[S2N_BIGNUM_STATIC 4],const uint64_t point[S2N_BIGNUM_STATIC 12]); + +// Reverse the bytes in a single word +// Input a; output function return +extern uint64_t word_bytereverse (uint64_t a); + +// Count leading zero bits in a single word +// Input a; output function return +extern uint64_t word_clz (uint64_t a); + +// Count 
trailing zero bits in a single word +// Input a; output function return +extern uint64_t word_ctz (uint64_t a); + +// Perform 59 "divstep" iterations and return signed matrix of updates +// Inputs d, f, g; output m[2][2] and function return +extern int64_t word_divstep59(int64_t m[2][2],int64_t d,uint64_t f,uint64_t g); + +// Return maximum of two unsigned 64-bit words +// Inputs a, b; output function return +extern uint64_t word_max (uint64_t a, uint64_t b); + +// Return minimum of two unsigned 64-bit words +// Inputs a, b; output function return +extern uint64_t word_min (uint64_t a, uint64_t b); + +// Single-word negated modular inverse (-1/a) mod 2^64 +// Input a; output function return +extern uint64_t word_negmodinv (uint64_t a); + +// Count number of set bits in a single 64-bit word (population count) +// Input a; output function return +extern uint64_t word_popcount (uint64_t a); + +// Single-word reciprocal, 2^64 + ret = ceil(2^128/a) - 1 if MSB of "a" is set +// Input a; output function return +extern uint64_t word_recip (uint64_t a); diff --git a/third_party/s2n-bignum/s2n-bignum-imported/non_ct_functions.txt b/third_party/s2n-bignum/s2n-bignum-imported/non_ct_functions.txt new file mode 100644 index 00000000000..5b9fe753cd1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/non_ct_functions.txt @@ -0,0 +1,8 @@ +p256/bignum_mod_n256o: +p256/bignum_mod_n256_alto: +p256/bignum_mod_p256o: +p256/bignum_mod_p256_alto: +p384/bignum_mod_n384o: +p384/bignum_mod_n384_alto: +p384/bignum_mod_p384o: +p384/bignum_mod_p384_alto: diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/Makefile b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/Makefile new file mode 100644 index 00000000000..075ec11a61f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/Makefile @@ -0,0 +1,343 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + +# These are the object files corresponding to sources for translation + +OBJ = curve25519/bignum_add_p25519.o \ + curve25519/bignum_cmul_p25519.o \ + curve25519/bignum_cmul_p25519_alt.o \ + curve25519/bignum_double_p25519.o \ + curve25519/bignum_inv_p25519.o \ + curve25519/bignum_invsqrt_p25519.o \ + curve25519/bignum_invsqrt_p25519_alt.o \ + curve25519/bignum_madd_n25519.o \ + curve25519/bignum_madd_n25519_alt.o \ + curve25519/bignum_mod_m25519_4.o \ + curve25519/bignum_mod_n25519.o \ + curve25519/bignum_mod_n25519_4.o \ + curve25519/bignum_mod_p25519_4.o \ + curve25519/bignum_mul_p25519.o \ + curve25519/bignum_mul_p25519_alt.o \ + curve25519/bignum_neg_p25519.o \ + curve25519/bignum_optneg_p25519.o \ + curve25519/bignum_sqr_p25519.o \ + curve25519/bignum_sqr_p25519_alt.o \ + curve25519/bignum_sqrt_p25519.o \ + curve25519/bignum_sqrt_p25519_alt.o \ + curve25519/bignum_sub_p25519.o \ + curve25519/curve25519_ladderstep.o \ + curve25519/curve25519_ladderstep_alt.o \ + curve25519/curve25519_pxscalarmul.o \ + curve25519/curve25519_pxscalarmul_alt.o \ + curve25519/curve25519_x25519.o \ + curve25519/curve25519_x25519_alt.o \ + curve25519/curve25519_x25519base.o \ + curve25519/curve25519_x25519base_alt.o \ + curve25519/edwards25519_decode.o \ + curve25519/edwards25519_decode_alt.o \ + curve25519/edwards25519_encode.o \ + curve25519/edwards25519_epadd.o \ + curve25519/edwards25519_epadd_alt.o \ + curve25519/edwards25519_epdouble.o \ + curve25519/edwards25519_epdouble_alt.o \ + curve25519/edwards25519_pdouble.o \ + curve25519/edwards25519_pdouble_alt.o \ + curve25519/edwards25519_pepadd.o \ + curve25519/edwards25519_pepadd_alt.o \ + curve25519/edwards25519_scalarmulbase.o \ + curve25519/edwards25519_scalarmulbase_alt.o \ + curve25519/edwards25519_scalarmuldouble.o \ + curve25519/edwards25519_scalarmuldouble_alt.o \ + fastmul/bignum_emontredc_8n.o \ + fastmul/bignum_kmul_16_32.o \ + fastmul/bignum_kmul_32_64.o \ + fastmul/bignum_ksqr_16_32.o \ + fastmul/bignum_ksqr_32_64.o \ + fastmul/bignum_mul_4_8.o \ + fastmul/bignum_mul_4_8_alt.o \ + fastmul/bignum_mul_6_12.o \ + fastmul/bignum_mul_6_12_alt.o \ + fastmul/bignum_mul_8_16.o \ + fastmul/bignum_mul_8_16_alt.o \ + fastmul/bignum_sqr_4_8.o \ + fastmul/bignum_sqr_4_8_alt.o \ + fastmul/bignum_sqr_6_12.o \ + fastmul/bignum_sqr_6_12_alt.o \ + fastmul/bignum_sqr_8_16.o \ + fastmul/bignum_sqr_8_16_alt.o \ + generic/bignum_add.o \ + generic/bignum_amontifier.o \ + generic/bignum_amontmul.o \ + generic/bignum_amontredc.o \ + generic/bignum_amontsqr.o \ + generic/bignum_bitfield.o \ + generic/bignum_bitsize.o \ + generic/bignum_cdiv.o \ + generic/bignum_cdiv_exact.o \ + generic/bignum_cld.o \ + generic/bignum_clz.o \ + generic/bignum_cmadd.o \ + generic/bignum_cmnegadd.o \ + generic/bignum_cmod.o \ + generic/bignum_cmul.o \ + generic/bignum_coprime.o \ + generic/bignum_copy.o \ + generic/bignum_ctd.o \ + generic/bignum_ctz.o \ + generic/bignum_demont.o \ + generic/bignum_digit.o \ + generic/bignum_digitsize.o \ + generic/bignum_divmod10.o \ + generic/bignum_emontredc.o \ + generic/bignum_eq.o \ + generic/bignum_even.o \ + generic/bignum_ge.o \ + generic/bignum_gt.o \ + generic/bignum_iszero.o \ + generic/bignum_le.o \ + generic/bignum_lt.o \ + generic/bignum_madd.o \ + generic/bignum_modadd.o \ + generic/bignum_moddouble.o \ + generic/bignum_modexp.o \ + generic/bignum_modifier.o \ + generic/bignum_modinv.o \ + 
generic/bignum_modoptneg.o \ + generic/bignum_modsub.o \ + generic/bignum_montifier.o \ + generic/bignum_montmul.o \ + generic/bignum_montredc.o \ + generic/bignum_montsqr.o \ + generic/bignum_mul.o \ + generic/bignum_muladd10.o \ + generic/bignum_mux.o \ + generic/bignum_mux16.o \ + generic/bignum_negmodinv.o \ + generic/bignum_nonzero.o \ + generic/bignum_normalize.o \ + generic/bignum_odd.o \ + generic/bignum_of_word.o \ + generic/bignum_optadd.o \ + generic/bignum_optneg.o \ + generic/bignum_optsub.o \ + generic/bignum_optsubadd.o \ + generic/bignum_pow2.o \ + generic/bignum_shl_small.o \ + generic/bignum_shr_small.o \ + generic/bignum_sqr.o \ + generic/bignum_sub.o \ + generic/word_bytereverse.o \ + generic/word_clz.o \ + generic/word_ctz.o \ + generic/word_divstep59.o \ + generic/word_max.o \ + generic/word_min.o \ + generic/word_negmodinv.o \ + generic/word_popcount.o \ + generic/word_recip.o \ + p256/bignum_add_p256.o \ + p256/bignum_bigendian_4.o \ + p256/bignum_cmul_p256.o \ + p256/bignum_cmul_p256_alt.o \ + p256/bignum_deamont_p256.o \ + p256/bignum_deamont_p256_alt.o \ + p256/bignum_demont_p256.o \ + p256/bignum_demont_p256_alt.o \ + p256/bignum_double_p256.o \ + p256/bignum_half_p256.o \ + p256/bignum_inv_p256.o \ + p256/bignum_littleendian_4.o \ + p256/bignum_mod_n256.o \ + p256/bignum_mod_n256_alt.o \ + p256/bignum_mod_n256_4.o \ + p256/bignum_mod_p256.o \ + p256/bignum_mod_p256_alt.o \ + p256/bignum_mod_p256_4.o \ + p256/bignum_montinv_p256.o \ + p256/bignum_montmul_p256.o \ + p256/bignum_montmul_p256_alt.o \ + p256/bignum_montsqr_p256.o \ + p256/bignum_montsqr_p256_alt.o \ + p256/bignum_mux_4.o \ + p256/bignum_neg_p256.o \ + p256/bignum_nonzero_4.o \ + p256/bignum_optneg_p256.o \ + p256/bignum_sub_p256.o \ + p256/bignum_tomont_p256.o \ + p256/bignum_tomont_p256_alt.o \ + p256/bignum_triple_p256.o \ + p256/bignum_triple_p256_alt.o \ + p256/p256_montjadd.o \ + p256/p256_montjadd_alt.o \ + p256/p256_montjdouble.o \ + p256/p256_montjdouble_alt.o \ + p256/p256_montjmixadd.o \ + p256/p256_montjmixadd_alt.o \ + p256/p256_montjscalarmul.o \ + p256/p256_montjscalarmul_alt.o \ + p256/p256_scalarmul.o \ + p256/p256_scalarmul_alt.o \ + p256/p256_scalarmulbase.o \ + p256/p256_scalarmulbase_alt.o \ + p384/bignum_add_p384.o \ + p384/bignum_bigendian_6.o \ + p384/bignum_cmul_p384.o \ + p384/bignum_cmul_p384_alt.o \ + p384/bignum_deamont_p384.o \ + p384/bignum_deamont_p384_alt.o \ + p384/bignum_demont_p384.o \ + p384/bignum_demont_p384_alt.o \ + p384/bignum_double_p384.o \ + p384/bignum_half_p384.o \ + p384/bignum_inv_p384.o \ + p384/bignum_littleendian_6.o \ + p384/bignum_mod_n384_alt.o \ + p384/bignum_mod_n384.o \ + p384/bignum_mod_n384_6.o \ + p384/bignum_mod_p384.o \ + p384/bignum_mod_p384_alt.o \ + p384/bignum_mod_p384_6.o \ + p384/bignum_montinv_p384.o \ + p384/bignum_montmul_p384.o \ + p384/bignum_montmul_p384_alt.o \ + p384/bignum_montsqr_p384.o \ + p384/bignum_montsqr_p384_alt.o \ + p384/bignum_mux_6.o \ + p384/bignum_neg_p384.o \ + p384/bignum_nonzero_6.o \ + p384/bignum_optneg_p384.o \ + p384/bignum_sub_p384.o \ + p384/bignum_tomont_p384.o \ + p384/bignum_tomont_p384_alt.o \ + p384/bignum_triple_p384.o \ + p384/bignum_triple_p384_alt.o \ + p384/p384_montjadd.o \ + p384/p384_montjadd_alt.o \ + p384/p384_montjdouble.o \ + p384/p384_montjdouble_alt.o \ + p384/p384_montjmixadd.o \ + p384/p384_montjmixadd_alt.o \ + p384/p384_montjscalarmul.o \ + p384/p384_montjscalarmul_alt.o \ + p521/bignum_add_p521.o \ + p521/bignum_cmul_p521.o \ + p521/bignum_cmul_p521_alt.o \ + 
p521/bignum_deamont_p521.o \ + p521/bignum_demont_p521.o \ + p521/bignum_double_p521.o \ + p521/bignum_fromlebytes_p521.o \ + p521/bignum_half_p521.o \ + p521/bignum_inv_p521.o \ + p521/bignum_mod_n521_9.o \ + p521/bignum_mod_n521_9_alt.o \ + p521/bignum_mod_p521_9.o \ + p521/bignum_montmul_p521.o \ + p521/bignum_montmul_p521_alt.o \ + p521/bignum_montsqr_p521.o \ + p521/bignum_montsqr_p521_alt.o \ + p521/bignum_mul_p521.o \ + p521/bignum_mul_p521_alt.o \ + p521/bignum_neg_p521.o \ + p521/bignum_optneg_p521.o \ + p521/bignum_sqr_p521.o \ + p521/bignum_sqr_p521_alt.o \ + p521/bignum_sub_p521.o \ + p521/bignum_tolebytes_p521.o \ + p521/bignum_tomont_p521.o \ + p521/bignum_triple_p521.o \ + p521/bignum_triple_p521_alt.o \ + p521/p521_jadd.o \ + p521/p521_jadd_alt.o \ + p521/p521_jdouble.o \ + p521/p521_jdouble_alt.o \ + p521/p521_jmixadd.o \ + p521/p521_jmixadd_alt.o \ + p521/p521_jscalarmul.o \ + p521/p521_jscalarmul_alt.o \ + secp256k1/bignum_add_p256k1.o \ + secp256k1/bignum_cmul_p256k1.o \ + secp256k1/bignum_cmul_p256k1_alt.o \ + secp256k1/bignum_deamont_p256k1.o \ + secp256k1/bignum_demont_p256k1.o \ + secp256k1/bignum_double_p256k1.o \ + secp256k1/bignum_half_p256k1.o \ + secp256k1/bignum_mod_n256k1_4.o \ + secp256k1/bignum_mod_p256k1_4.o \ + secp256k1/bignum_montmul_p256k1.o \ + secp256k1/bignum_montmul_p256k1_alt.o \ + secp256k1/bignum_montsqr_p256k1.o \ + secp256k1/bignum_montsqr_p256k1_alt.o \ + secp256k1/bignum_mul_p256k1.o \ + secp256k1/bignum_mul_p256k1_alt.o \ + secp256k1/bignum_neg_p256k1.o \ + secp256k1/bignum_optneg_p256k1.o \ + secp256k1/bignum_sqr_p256k1.o \ + secp256k1/bignum_sqr_p256k1_alt.o \ + secp256k1/bignum_sub_p256k1.o \ + secp256k1/bignum_tomont_p256k1.o \ + secp256k1/bignum_tomont_p256k1_alt.o \ + secp256k1/bignum_triple_p256k1.o \ + secp256k1/bignum_triple_p256k1_alt.o \ + secp256k1/secp256k1_jadd.o \ + secp256k1/secp256k1_jadd_alt.o \ + secp256k1/secp256k1_jdouble.o \ + secp256k1/secp256k1_jdouble_alt.o \ + secp256k1/secp256k1_jmixadd.o \ + secp256k1/secp256k1_jmixadd_alt.o \ + sm2/bignum_add_sm2.o \ + sm2/bignum_cmul_sm2.o \ + sm2/bignum_cmul_sm2_alt.o \ + sm2/bignum_deamont_sm2.o \ + sm2/bignum_demont_sm2.o \ + sm2/bignum_double_sm2.o \ + sm2/bignum_half_sm2.o \ + sm2/bignum_inv_sm2.o \ + sm2/bignum_mod_nsm2.o \ + sm2/bignum_mod_nsm2_alt.o \ + sm2/bignum_mod_nsm2_4.o \ + sm2/bignum_mod_sm2.o \ + sm2/bignum_mod_sm2_4.o \ + sm2/bignum_montinv_sm2.o \ + sm2/bignum_montmul_sm2.o \ + sm2/bignum_montmul_sm2_alt.o \ + sm2/bignum_montsqr_sm2.o \ + sm2/bignum_montsqr_sm2_alt.o \ + sm2/bignum_neg_sm2.o \ + sm2/bignum_optneg_sm2.o \ + sm2/bignum_sub_sm2.o \ + sm2/bignum_tomont_sm2.o \ + sm2/bignum_triple_sm2.o \ + sm2/bignum_triple_sm2_alt.o \ + sm2/sm2_montjadd.o \ + sm2/sm2_montjadd_alt.o \ + sm2/sm2_montjdouble.o \ + sm2/sm2_montjdouble_alt.o \ + sm2/sm2_montjmixadd.o \ + sm2/sm2_montjmixadd_alt.o \ + sm2/sm2_montjscalarmul.o \ + sm2/sm2_montjscalarmul_alt.o + +# The AT&T syntax source files + +ATTSOURCES = $(OBJ:.o=.S) + +code: $(ATTSOURCES) + +all: $(OBJ); + +%.o : %.S ; ($(CC) -E -I../include $< | as -o $@ -); (cd ../x86; $(CC) -E -I../include $< | as -o /tmp/original_object.o); cmp -s $@ /tmp/original_object.o + +curve25519/%.S :: ../x86/curve25519/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 
$< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +fastmul/%.S :: ../x86/fastmul/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +generic/%.S :: ../x86/generic/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +p256/%.S :: ../x86/p256/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +p384/%.S :: ../x86/p384/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +p521/%.S :: ../x86/p521/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +secp256k1/%.S :: ../x86/secp256k1/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) +sm2/%.S :: ../x86/sm2/%.S ; (cat $< | sed -E -f ./attrofy.sed >$@) && ($(CC) -E -I../include -DWINDOWS_ABI=0 $@ | as -o /tmp/translated_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $@ | as -o /tmp/translated_object.obj) ; ($(CC) -E -I../include -DWINDOWS_ABI=0 $< | as -o /tmp/original_object.o) ; ($(CC) -E -I../include -DWINDOWS_ABI=1 $< | 
as -o /tmp/original_object.obj) ; (cmp -s /tmp/translated_object.o /tmp/original_object.o && cmp -s /tmp/translated_object.obj /tmp/original_object.obj) + +clean:; rm -f */*.o + +clobber:; rm -f */*.o */*.S diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/README.md b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/README.md new file mode 100644 index 00000000000..2dd37851cb3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/README.md @@ -0,0 +1,16 @@ +## AT&T syntax versions + +This directory contains AT&T syntax equivalents of the original Intel +syntax assembler files, generated automatically by a naive script and +subject to a sanity check that the object code doesn't change. All the +*/*.S files are generated ("make code"). Direct modification of these +files is not recommended. + + make code --- Generate */*.S files, subject to sanity check + make all --- Generate */*.S and */*.o files with sanity check + make clean --- Delete object files + make clobber --- Delete object files and generated code + +For more on the two syntax variants see: + + https://en.wikipedia.org/wiki/X86_assembly_language#Syntax diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/attrofy.sed b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/attrofy.sed new file mode 100644 index 00000000000..40547107795 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/attrofy.sed @@ -0,0 +1,136 @@ +############################################################################# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +############################################################################# + + ############################################################################ + # * * * NOTE * * * # + # # + # This is a primitive script to automate conversion of certain particular # + # x86 assembler files from Intel to AT&T syntax. It is *not* a general # + # conversion and is very tied to the specific limitations and conventions # + # in the intended targets. Even in that setting we only use it with an # + # additional sanity check that the object code generated is the same in # + # both original and translated code according to the GNU assembler. 
# + ############################################################################ + +s/\.intel_syntax *noprefix// + +# Don't make any transforms on lines with the argument-taking macros + +/ addrow .+,/b +/ mulpadd .+,/b +/ mulpadda .+,/b +/ mulpade .+,/b +/ mulrow .+,/b + +# Reverse the argument order for binary and ternary instructions + +s/^(([a-z_0-9]+\:)* +[a-z_0-9]+ +)([^ (][^,/]*), *([^ ][^/,;]*)([/;].*)*$/\1\4, \3 \5/ +s/^(([a-z_0-9]+\:)* +[a-z_0-9]+ +)([^ (][^,/]*), *([^ ][^/,]*), *([^ ][^/,;]*)([/;].*)*$/\1\5, \4, \3 \6/ + +# Fix up whitespace just in case + +s/ +,/,/ + +# Decorate literals with $ + +s/^(([a-z_0-9]+\:)* +[a-z_0-9]+ +)(([-~+*/()A-Z0-9]*(0x[a-zA-Z0-9]*)*)* *\,)/\1$\3/ + +# Translate relative addresses with uppercase base variable +# Turn defined offset fields into explicit indirections to match + +s/^([^/][^[]+)[[]([A-Z_0-9]+)[]]/\1\2/ +s/^([^/][^[]+)[[]([A-Z][A-Z_0-9]*) *\+ *([^]]+)[]]/\1\3\+\2/ + +s/^\#define *([a-z][a-z_0-9]*) *([a-z][a-z_0-9]*) *\+(.*)/\#define \1 \3\(\2\)/ + +# Translate relative addresses + +s/^([^/][^[]+)[[]([a-z_0-9]+)[]]/\1\(\2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *8\*([a-z][a-z_0-9]*) *\+ *([a-z_A-Z0-9]+)[]]/\1\4\(\2,\3,8\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *([a-z][a-z_0-9]*) *\+ *([a-z_A-Z0-9]+)[]]/\1\4\(\2,\3,1\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *8\*([a-z][a-z_0-9]*) *\- *([a-z_A-Z0-9]+)[]]/\1\-\4\(\2,\3,8\)/ +s/^([^/][^[]+)[[](rip) *\+ *([a-z_A-Z0-9* ]+)[]]/\1\3\(\2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *([A-Z0-9* ]+)[]]/\1\3\(\2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\- *([A-Z0-9* ]+)[]]/\1\-\3\(\2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *8\*([a-z][a-z_0-9]*)[]]/\1\(\2,\3,8\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *4\*([a-z][a-z_0-9]*)[]]/\1\(\2,\3,4\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *2\*([a-z][a-z_0-9]*)[]]/\1\(\2,\3,2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *([a-z][a-z_0-9]*)[]]/\1\(\2,\3\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\+ *([^]]+)[]]/\1\3\(\2\)/ +s/^([^/][^[]+)[[]([a-z][a-z_0-9]*) *\- *([^]]+)[]]/\1-\3\(\2\)/ +s/^([^/][^[]+)[[]([^]]+)[]]/\1\(\2\)/ + +# Put % in front of register names + +s/ ax *$/ %ax/ +s/ ax,/ %ax,/ +s/ cl *$/ %cl/ +s/ cl,/ %cl,/ +s/([[(,.;: ])([re][abcd]x)/\1\%\2/g +s/([[(,.;: ])([re]sp)/\1\%\2/g +s/([[(,.;: ])([re]bp)/\1\%\2/g +s/([[(,.;: ])([re]si)/\1\%\2/g +s/([[(,.;: ])([re]di)/\1\%\2/g +s/([[(,.;: ])(r8d*)/\1\%\2/g +s/([[(,.;: ])(r9d*)/\1\%\2/g +s/([[(,.;: ])(r1[0-5]d*)/\1\%\2/g +s/([[(,.;: ])([re]ip)/\1\%\2/g + +# Add explicit sizes to instructions + +s/QWORD PTR//g + +s/ adc / adcq /g +s/ adcx / adcxq /g +s/ add / addq /g +s/ adox / adoxq /g +s/ and / andq /g +s/ bsf / bsfq /g +s/ bsr / bsrq /g +s/ bswap / bswapq /g +s/ bt / btq /g +s/ call / callq /g +s/ cmovae / cmovaeq /g +s/ cmovb / cmovbq /g +s/ cmovc / cmovcq /g +s/ cmove / cmoveq /g +s/ cmovnc / cmovncq /g +s/ cmovne / cmovneq /g +s/ cmovnz / cmovnzq /g +s/ cmovz / cmovzq /g +s/ cmp / cmpq /g +s/ dec / decq /g +s/ imul / imulq /g +s/ inc / incq /g +s/ lea / leaq /g +s/ mov / movq /g +s/ movabs / movabsq /g +s/ mul / mulq /g +s/ mulx / mulxq /g +s/ neg / negq /g +s/ not / notq /g +s/ or / orq /g +s/ pop / popq /g +s/ push / pushq /g +s/ sar / sarq /g +s/ sbb / sbbq /g +s/ shl / shlq /g +s/ shld / shldq /g +s/ shr / shrq /g +s/ shrd / shrdq /g +s/ sub / subq /g +s/ test / testq /g +s/ xor / xorq /g + +s/q( .*zeroe)/l\1/ +s/q( .*plus2e)/l\1/ +s/q( .*short)/l\1/ +s/q( .*%e)/l\1/ +s/q( .*%r[0-9]+d)/l\1/ +s/q( .*%ax)/w\1/ + +# Eliminate any trailing spaces, just to be tidy + +s/ +$// diff --git 
a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_add_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_add_p25519.S new file mode 100644 index 00000000000..b4c3f21fb78 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_add_p25519.S @@ -0,0 +1,103 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_25519, z := (x + y) mod p_25519, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p25519) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// These also re-use inputs x and y when safe to do so + +#define c0 %rax +#define c1 %rcx +#define c2 %rsi +#define c3 %rdx +#define c0short %eax +#define c1short %ecx +#define c2short %esi +#define c3short %edx + +S2N_BN_SYMBOL(bignum_add_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Add as [d3; d2; d1; d0] = x + y; since we assume x, y < 2^255 - 19 +// this sum fits in 256 bits. + + movq (x), d0 + addq (y), d0 + movq 8(x), d1 + adcq 8(y), d1 + movq 16(x), d2 + adcq 16(y), d2 + movq 24(x), d3 + adcq 24(y), d3 + +// Now x + y >= 2^255 - 19 <=> x + y + 19 >= 2^255. +// Form [c3; c2; c1; c0] = (x + y) + 19 + + movl $19, c0short + xorl c1short, c1short + xorl c2short, c2short + xorl c3short, c3short + + addq d0, c0 + adcq d1, c1 + adcq d2, c2 + adcq d3, c3 + +// Test the top bit to see if this is >= 2^255, and clear it as a masking +// so that in that case the result is exactly (x + y) - (2^255 - 19). +// Then select the output according to that top bit as that or just x + y. + + btr $63, c3 + cmovcq c0, d0 + cmovcq c1, d1 + cmovcq c2, d2 + cmovcq c3, d3 + +// Store the result + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519.S new file mode 100644 index 00000000000..4eed06e8532 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519.S @@ -0,0 +1,113 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p25519 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p25519) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply + +#define x %rcx +#define c %rcx +#define cshort %ecx + +// Used as a zero register after the initial move + +#define zero %rsi +#define zeroe %esi + +// Likewise this is thrown away after initial multiply + +#define d %rdx +#define a %rax +#define ashort %eax + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +S2N_BN_SYMBOL(bignum_cmul_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want multiplier in %rdx) + + movq %rdx, x + movq %rsi, d + +// Multiply, accumulating the result as [d4;d3;d2;d1;d0] + + mulxq (x), d0, d1 + mulxq 8(x), a, d2 + addq a, d1 + mulxq 16(x), a, d3 + adcq a, d2 + mulxq 24(x), d, a + adcq d, d3 + adcq $0, a + +// Let [d4;d3;d2;d1;d0] = 2^255 * h + l, and use q = h + 1 as the initial +// quotient estimate, which is either right or 1 too big. + + shldq $1, d3, a + movl $19, cshort + incq a + bts $63, d3 + mulq c + xorl zeroe, zeroe + addq a, d0 + adcq d, d1 + adcq zero, d2 + adcq zero, d3 + +// Correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + cmovcq zero, c + subq c, d0 + sbbq zero, d1 + sbbq zero, d2 + sbbq zero, d3 + btr $63, d3 + +// Write everything back + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519_alt.S new file mode 100644 index 00000000000..e31805f2342 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_cmul_p25519_alt.S @@ -0,0 +1,127 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_25519, z := (c * x) mod p_25519, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p25519_alt +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p25519_alt) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply + +#define x %rcx + +// Used as a zero register after the initial move + +#define zero %rsi +#define zeroe %esi + +// Likewise this is thrown away after initial multiply + +#define d %rdx +#define a %rax +#define ashort %eax + +#define c %rcx +#define cshort %ecx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 +#define d4 %rdx + +S2N_BN_SYMBOL(bignum_cmul_p25519_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want %rdx for the high parts of products) + + movq %rdx, x + +// Multiply, accumulating the result as [d4;d3;d2;d1;d0] + + movq (x), a + mulq %rsi + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq %rsi + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq %rsi + addq a, d2 + adcq $0, d + + movq 24(x), a + movq d, d3 + mulq %rsi + xorl zeroe, zeroe + addq a, d3 + adcq zero, d4 + +// Let [d4;d3;d2;d1;d0] = 2^255 * h + l, and use q = h + 1 as the initial +// quotient estimate, which is either right or 1 too big. + + shldq $1, d3, d4 + movl $19, cshort + leaq 1(d4), a + bts $63, d3 + mulq c + addq a, d0 + adcq d, d1 + adcq zero, d2 + adcq zero, d3 + +// Correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + cmovcq zero, c + subq c, d0 + sbbq zero, d1 + sbbq zero, d2 + sbbq zero, d3 + btr $63, d3 + +// Write everything back + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_double_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_double_p25519.S new file mode 100644 index 00000000000..dec97c2c98c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_double_p25519.S @@ -0,0 +1,101 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_25519, z := (2 * x) mod p_25519, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p25519 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p25519) + .text + +#define z %rdi +#define x %rsi + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// These also re-use input x when safe to do so + +#define c0 %rax +#define c1 %rcx +#define c2 %rsi +#define c3 %rdx +#define c0short %eax +#define c1short %ecx +#define c2short %esi +#define c3short %edx + +S2N_BN_SYMBOL(bignum_double_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Add as [d3; d2; d1; d0] = 2 * x; since we assume x < 2^255 - 19 +// this result fits in 256 bits. + + movq (x), d0 + addq d0, d0 + movq 8(x), d1 + adcq d1, d1 + movq 16(x), d2 + adcq d2, d2 + movq 24(x), d3 + adcq d3, d3 + +// Now 2 * x >= 2^255 - 19 <=> 2 * x + 19 >= 2^255. +// Form [c3; c2; c1; c0] = (2 * x) + 19 + + movl $19, c0short + xorl c1short, c1short + xorl c2short, c2short + xorl c3short, c3short + + addq d0, c0 + adcq d1, c1 + adcq d2, c2 + adcq d3, c3 + +// Test the top bit to see if this is >= 2^255, and clear it as a masking +// so that in that case the result is exactly (2 * x) - (2^255 - 19). +// Then select the output according to that top bit as that or just 2 * x. + + btr $63, c3 + cmovcq c0, d0 + cmovcq c1, d1 + cmovcq c2, d2 + cmovcq c3, d3 + +// Store the result + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_inv_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_inv_p25519.S new file mode 100644 index 00000000000..f83974a21f4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_inv_p25519.S @@ -0,0 +1,1587 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_25519 = 2^255 - 19 +// Input x[4]; output z[4] +// +// extern void bignum_inv_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Assuming the 4-digit input x is coprime to p_25519, i.e. is not divisible +// by it, returns z < p_25519 such that x * z == 1 (mod p_25519). The input +// x does not need to be reduced modulo p_25519, but the output always is. 
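+//
+// Equivalently, for x coprime to p_25519 the result is x^(p_25519 - 2) mod
+// p_25519, since x * x^(p_25519 - 2) == x^(p_25519 - 1) == 1 (mod p_25519)
+// by Fermat's little theorem. The code below does not compute that
+// exponentiation; it instead iterates an inlined variant of the s2n-bignum
+// word_divstep59 operation (the divstep59 macro below) while accumulating
+// the corresponding [u,v] cofactors.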
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p25519) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define f 0(%rsp) +#define g (4*N)(%rsp) +#define u (8*N)(%rsp) +#define v (12*N)(%rsp) +#define tmp (16*N)(%rsp) +#define tmp2 (17*N)(%rsp) +#define i (18*N)(%rsp) +#define d (19*N)(%rsp) + +#define mat (20*N)(%rsp) + +// Backup for the input pointer + +#define res (24*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (26*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (4*N) +#define U (8*N) +#define V (12*N) +#define MAT (20*N) + +#define ff (%rsp) +#define gg (4*N)(%rsp) + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, 
%rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq 
%rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq 
%r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + 
btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + 
cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + 
shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_inv_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Copy the input and the prime into the main f and g variables. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + xorl %eax, %eax + leaq -19(%rax), %rcx + notq %rax + movq %rcx, F(%rsp) + movq %rax, F+8(%rsp) + movq %rax, F+16(%rsp) + btr $63, %rax + movq %rax, F+24(%rsp) + + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $19, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, G(%rsp) + movq %rcx, G+0x8(%rsp) + movq %r8, G+0x10(%rsp) + movq %r9, G+0x18(%rsp) + +// Also maintain weakly reduced < 2*p_25519 vector [u,v] such that +// [f,g] == x * 2^{590-59*i} * [u,v] (mod p_25519) +// starting with [p_25519,x] == x * 2^{590-59*0} * [0,2^-590] (mod p_25519) + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+8(%rsp) + movq %rax, U+16(%rsp) + movq %rax, U+24(%rsp) + + movq $0xa0f99e2375022099, %rax + movq %rax, V(%rsp) + movq $0xa8c68f3f1d132595, %rax + movq %rax, V+8(%rsp) + movq $0x6c6c893805ac5242, %rax + movq %rax, V+16(%rsp) + movq $0x276508b241770615, %rax + movq %rax, V+24(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + movq $10, i + movq $1, d + jmp bignum_inv_p25519_midloop + +bignum_inv_p25519_loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
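+//
+// In matrix terms, with the (still negated) matrix returned by divstep59 as
+//
+//      [ m00  m01 ]   [ %r8   %r10 ]
+//      [ m10  m11 ] = [ %r12  %r14 ]
+//
+// the digit-by-digit accumulation below computes
+//
+//      f' = (m00 * f + m01 * g) >> 59
+//      g' = (m10 * f + m11 * g) >> 59
+//
+// and the same matrix is afterwards applied to [u,v], keeping those results
+// reduced below 2 * p_25519. Each signed product m * d is formed from the
+// sign-magnitude split above as |m| * (d XOR sign) plus a (|m| AND sign)
+// offset added once at the bottom (the offsets computed above, also saved
+// in tmp and tmp2 for the [u,v] pass), i.e. complementing rather than
+// negating.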
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digits 3 and 4 of [f,g] + + movq F+3*N(%rsp), %rax + xorq %r9, %rax + movq %rax, %rbp + sarq $63, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $63, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + shrdq $59, %rbp, %rsi + + movq F+3*N(%rsp), %rax + movq %rsi, F+3*N(%rsp) + + xorq %r13, %rax + movq %rax, %rsi + sarq $63, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq G+3*N(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $63, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + shrdq $59, %rsi, %rbx + movq %rbx, G+3*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx 
+ + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Modular reduction of u + + movq %rdx, %rbx + shldq $1, %rcx, %rdx + sarq $63, %rbx + addq %rbx, %rdx + movl $19, %eax + imulq %rdx + movq U(%rsp), %r8 + addq %rax, %r8 + movq %r8, U(%rsp) + movq U+N(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, U+N(%rsp) + movq U+2*N(%rsp), %r8 + adcq %rbx, %r8 + movq %r8, U+2*N(%rsp) + adcq %rbx, %rcx + shlq $63, %rax + addq %rax, %rcx + +// Preload for last use of old u digit 3 + + movq U+3*N(%rsp), %rax + movq %rcx, U+3*N(%rsp) + +// Digits 3 and 4 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+3*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + +// Modular reduction of v + + movq %rdx, %rcx + shldq $1, %rsi, %rdx + sarq $63, %rcx + movl $19, %eax + addq %rcx, %rdx + imulq %rdx + movq V(%rsp), %r8 + addq %rax, %r8 + movq %r8, V(%rsp) + movq V+N(%rsp), %r8 + adcq %rdx, %r8 + movq %r8, V+N(%rsp) + movq V+2*N(%rsp), %r8 + adcq %rcx, %r8 + movq %r8, V+2*N(%rsp) + adcq %rcx, %rsi + shlq $63, %rax + addq %rax, %rsi + movq %rsi, V+3*N(%rsp) + +bignum_inv_p25519_midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz bignum_inv_p25519_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_25519) +// we want to flip the sign of u according to that of f. 
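+//
+// Concretely: f == x * u (mod p_25519) with f = +1 or -1, so the inverse is
+// z = sign(f) * u mod p_25519. The sign mask of f just computed in %rax is
+// therefore XORed into the sign masks of the matrix entries below, flipping
+// the sign of the final u composition accordingly.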
+ + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + +// Modular reduction of u, this time strictly 2^255-19. + + movq %r9, %rax + shldq $1, %r15, %rax + sarq $63, %r9 + movl $19, %ebx + leaq 1(%rax,%r9,1), %rax + imulq %rbx + xorl %ebp, %ebp + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r9, %r14 + adcq %r9, %r15 + shlq $63, %rax + addq %rax, %r15 + cmovns %rbp, %rbx + subq %rbx, %r12 + sbbq %rbp, %r13 + sbbq %rbp, %r14 + sbbq %rbp, %r15 + btr $0x3f, %r15 + +// Store it back to the final output + + movq res, %rdi + movq %r12, (%rdi) + movq %r13, N(%rdi) + movq %r14, 2*N(%rdi) + movq %r15, 3*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519.S new file mode 100644 index 00000000000..4c85691c433 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519.S @@ -0,0 +1,594 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Inverse square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_invsqrt_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular inverse square root mod p_25519, +// i.e. a z such that x * z^2 == 1 (mod p_25519), whenever one exists. The +// inverse square root z is chosen so that its LSB is even (note that p_25519-z +// is another possibility). 
The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular inverse square +// root and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 so trivially there is no inverse square root +// +1: x is coprime to p_25519 and z is indeed an inverse square root +// -1: x is coprime to p_25519 but there is no (inverse or direct) square root +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_invsqrt_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_invsqrt_p25519) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a 0(%rsp) +#define b (4*N)(%rsp) +#define s (8*N)(%rsp) +#define t (12*N)(%rsp) +#define u (16*N)(%rsp) +#define res (20*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (22*N) + +// Corrupted versions when stack is down 8 more + +#define u8 (17*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define A 0 +#define B (4*N) +#define S (8*N) +#define T (12*N) +#define U (16*N) +#define U8 (17*N) + +S2N_BN_SYMBOL(bignum_invsqrt_p25519): + _CET_ENDBR + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq bignum_invsqrt_p25519_standard + popq %rsi + popq %rdi + ret + +bignum_invsqrt_p25519_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Set up reduced version of the input argument a = x mod p_25519. 
Then +// get the candidate inverse square root s = a^{252-3} + + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, A(%rsp) + movq %rcx, A+0x8(%rsp) + movq %r8, A+0x10(%rsp) + movq %r9, A+0x18(%rsp) + + // Power 2^2 - 1 = 3 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^4 - 1 = 15 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^5 - 1 = 31 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^10 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^20 - 1 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^25 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^50 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^100 - 1 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^125 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^250 - 1 + + leaq S(%rsp), %rdi + movq $125, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + // Power 2^252 - 3 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq S(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + +// s = a^{2^252-3} is now one candidate inverse square root. 
+// Generate the other one t = s * j_25519 where j_25519 = sqrt(-1) + + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + +// Now multiplex between them according to whether a * s^2 = 1 + + leaq B(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq A(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + movq B(%rsp), %rax + xorq $1, %rax + movq B+8(%rsp), %rbx + orq %rbx, %rax + movq B+16(%rsp), %rcx + movq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + + movq S(%rsp), %rax + movq T(%rsp), %rbx + cmovnzq %rbx, %rax + movq S+8(%rsp), %rbx + movq T+8(%rsp), %rcx + cmovnzq %rcx, %rbx + movq S+16(%rsp), %rcx + movq T+16(%rsp), %rdx + cmovnzq %rdx, %rcx + movq S+24(%rsp), %rbp + movq T+24(%rsp), %rdx + cmovnzq %rdx, %rbp + +// For definiteness, choose "positive" (LSB=0) inverse square root + + xorl %edx, %edx + leaq -19(%rdx), %r8 + leaq -1(%rdx), %r11 + movq %r11, %r9 + movq %r11, %r10 + btr $63, %r11 + + subq %rax, %r8 + sbbq %rbx, %r9 + sbbq %rcx, %r10 + sbbq %rbp, %r11 + + movq res, %rdx + testq $1, %rax + cmovnzq %r8, %rax + movq %rax, (%rdx) + cmovnzq %r9, %rbx + movq %rbx, 8(%rdx) + cmovnzq %r10, %rcx + movq %rcx, 16(%rdx) + cmovnzq %r11, %rbp + movq %rbp, 24(%rdx) + +// Determine if it is is indeed an inverse square root, also distinguishing +// the degenerate x * z^2 == 0 (mod p_25519) case, which is equivalent to +// x == 0 (mod p_25519). Hence return the Legendre-Jacobi symbol as required. 
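+//
+// The check below squares the value just written to z (the pointer to z is
+// still in %rdx from the stores above) and multiplies by a, so in effect it
+// computes b = a * z^2 mod p_25519 and returns
+//
+//      b == 1  ->  +1     (z is indeed an inverse square root of x)
+//      b == 0  ->   0     (only possible when a == 0, as p_25519 is prime)
+//      else    ->  -1     (no square root of x exists mod p_25519)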
+ + leaq B(%rsp), %rdi + movq $1, %rsi + callq bignum_invsqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq A(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_mul_p25519 + + movq $1, %rax + movq B(%rsp), %rbp + xorq %rbp, %rax + movq B+8(%rsp), %rbx + orq %rbx, %rax + orq %rbx, %rbp + movq B+16(%rsp), %rcx + movq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + orq %rcx, %rbp + + negq %rax + sbbq %rax, %rax + leaq 1(%rax,%rax,1), %rax + + testq %rbp, %rbp + cmovzq %rbp, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_invsqrt_p25519_mul_p25519: + movq %rdx, %rcx + xorl %ebp, %ebp + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsi), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsi), %rax, %r12 + adcq %rax, %r11 + adcq %rbp, %r12 + xorl %ebp, %ebp + movq 0x8(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + xorl %ebp, %ebp + movq 0x10(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + movq 0x18(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rcx, %r15 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movl $0x26, %edx + mulxq %r15, %rax, %rbx + adcxq %rcx, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + addq %r11, %rax + adcq %rbp, %rbx + btq $0x3f, %rax + adcq %rbx, %rbx + leaq 0x1(%rbx), %rcx + imulq $0x13, %rcx, %rcx + xorl %ebp, %ebp + adoxq %rcx, %r8 + mulxq %r12, %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq %r13, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r14, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq %r15, %rax, %rbx + adcq %rax, %r11 + shlq $0x3f, %rcx + cmpq %rcx, %r11 + movl $0x13, %eax + cmovns %rbp, %rax + subq %rax, %r8 + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rbp, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_invsqrt_p25519_nsqr_p25519: + +// Copy input argument into u + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + movq %rax, U8(%rsp) + movq %rbx, U8+8(%rsp) + movq %rcx, U8+16(%rsp) + movq %rdx, U8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_invsqrt_p25519_loop: + movq U8(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq U8+0x8(%rsp), %r9, %r10 + mulxq U8+0x18(%rsp), %r11, %r12 + movq U8+0x10(%rsp), %rdx + mulxq U8+0x18(%rsp), %r13, %r14 + xorl %ebx, %ebx + mulxq U8(%rsp), %rax, %rcx + adcxq 
%rax, %r10 + adoxq %rcx, %r11 + mulxq U8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + movq U8+0x18(%rsp), %rdx + mulxq U8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rbx, %r13 + adoxq %rbx, %r14 + adcq %rbx, %r14 + xorl %ebx, %ebx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq U8+0x8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq U8+0x10(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq U8+0x18(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbx, %r15 + adoxq %rbx, %r15 + movl $0x26, %edx + xorl %ebx, %ebx + mulxq %r12, %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq %r13, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r14, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq %r15, %rax, %r12 + adcxq %rax, %r11 + adoxq %rbx, %r12 + adcxq %rbx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rbx, %r9 + adcq %rbx, %r10 + adcq %rbx, %r11 + movq %r8, U8(%rsp) + movq %r9, U8+0x8(%rsp) + movq %r10, U8+0x10(%rsp) + movq %r11, U8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz bignum_invsqrt_p25519_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. + + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519_alt.S new file mode 100644 index 00000000000..78bcb3e5cfb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_invsqrt_p25519_alt.S @@ -0,0 +1,675 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Inverse square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_invsqrt_p25519_alt(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular inverse square root mod p_25519, +// i.e. a z such that x * z^2 == 1 (mod p_25519), whenever one exists. The +// inverse square root z is chosen so that its LSB is even (note that p_25519-z +// is another possibility). 
The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular inverse square +// root and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 so trivially there is no inverse square root +// +1: x is coprime to p_25519 and z is indeed an inverse square root +// -1: x is coprime to p_25519 but there is no (inverse or direct) square root +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_invsqrt_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_invsqrt_p25519_alt) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a 0(%rsp) +#define b (4*N)(%rsp) +#define s (8*N)(%rsp) +#define t (12*N)(%rsp) +#define u (16*N)(%rsp) +#define res (20*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (22*N) + +// Corrupted versions when stack is down 8 more + +#define u8 (17*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define A 0 +#define B (4*N) +#define S (8*N) +#define T (12*N) +#define U (16*N) +#define U8 (17*N) + +S2N_BN_SYMBOL(bignum_invsqrt_p25519_alt): + _CET_ENDBR + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq bignum_invsqrt_p25519_alt_standard + popq %rsi + popq %rdi + ret + +bignum_invsqrt_p25519_alt_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Set up reduced version of the input argument a = x mod p_25519. 
Then +// get the candidate inverse square root s = a^{252-3} + + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, A(%rsp) + movq %rcx, A+0x8(%rsp) + movq %r8, A+0x10(%rsp) + movq %r9, A+0x18(%rsp) + + // Power 2^2 - 1 = 3 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^4 - 1 = 15 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^5 - 1 = 31 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^10 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^20 - 1 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^25 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^50 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^100 - 1 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^125 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^250 - 1 + + leaq S(%rsp), %rdi + movq $125, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + // Power 2^252 - 3 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq S(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + +// s = a^{2^252-3} is now one candidate inverse square root. 
+// Generate the other one t = s * j_25519 where j_25519 = sqrt(-1) + + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + +// Now multiplex between them according to whether a * s^2 = 1 + + leaq B(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq A(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + movq B(%rsp), %rax + xorq $1, %rax + movq B+8(%rsp), %rbx + orq %rbx, %rax + movq B+16(%rsp), %rcx + movq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + + movq S(%rsp), %rax + movq T(%rsp), %rbx + cmovnzq %rbx, %rax + movq S+8(%rsp), %rbx + movq T+8(%rsp), %rcx + cmovnzq %rcx, %rbx + movq S+16(%rsp), %rcx + movq T+16(%rsp), %rdx + cmovnzq %rdx, %rcx + movq S+24(%rsp), %rbp + movq T+24(%rsp), %rdx + cmovnzq %rdx, %rbp + +// For definiteness, choose "positive" (LSB=0) inverse square root + + xorl %edx, %edx + leaq -19(%rdx), %r8 + leaq -1(%rdx), %r11 + movq %r11, %r9 + movq %r11, %r10 + btr $63, %r11 + + subq %rax, %r8 + sbbq %rbx, %r9 + sbbq %rcx, %r10 + sbbq %rbp, %r11 + + movq res, %rdx + testq $1, %rax + cmovnzq %r8, %rax + movq %rax, (%rdx) + cmovnzq %r9, %rbx + movq %rbx, 8(%rdx) + cmovnzq %r10, %rcx + movq %rcx, 16(%rdx) + cmovnzq %r11, %rbp + movq %rbp, 24(%rdx) + +// Determine if it is is indeed an inverse square root, also distinguishing +// the degenerate x * z^2 == 0 (mod p_25519) case, which is equivalent to +// x == 0 (mod p_25519). Hence return the Legendre-Jacobi symbol as required. 
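+//
+// The return value is then formed without branches: with b = a * z^2 as
+// computed below, %rax becomes the OR of the digits of b XOR 1 (zero exactly
+// when b == 1) and %rbp the OR of the digits of b (zero exactly when b == 0).
+// The neg/sbb pair turns %rax into a 0 or -1 mask, the lea maps that to
+// +1 or -1, and the final cmovz overrides the result with 0 when b == 0.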
+ + leaq B(%rsp), %rdi + movq $1, %rsi + callq bignum_invsqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq A(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_invsqrt_p25519_alt_mul_p25519 + + movq $1, %rax + movq B(%rsp), %rbp + xorq %rbp, %rax + movq B+8(%rsp), %rbx + orq %rbx, %rax + orq %rbx, %rbp + movq B+16(%rsp), %rcx + movq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + orq %rcx, %rbp + + negq %rax + sbbq %rax, %rax + leaq 1(%rax,%rax,1), %rax + + testq %rbp, %rbp + cmovzq %rbp, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_invsqrt_p25519_alt_mul_p25519: + movq %rdx, %rcx + movq (%rsi), %rax + mulq (%rcx) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsi), %rax + mulq (%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq (%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsi), %rax + mulq (%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsi), %rax + mulq (%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %esi + movq %r12, %rax + mulq %rsi + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rsi + subq %rcx, %rdx + xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + leaq 0x1(%r12), %rax + movl $0x13, %esi + bts $0x3f, %r11 + imulq %rsi, %rax + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + sbbq %rax, %rax + notq %rax + andq %rsi, %rax + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rcx, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_invsqrt_p25519_alt_nsqr_p25519: + +// Copy input argument into u + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + 
movq %rax, U8(%rsp) + movq %rbx, U8+8(%rsp) + movq %rcx, U8+16(%rsp) + movq %rdx, U8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_invsqrt_p25519_alt_loop: + movq U8(%rsp), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq U8(%rsp), %rax + mulq U8+0x8(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq U8+0x8(%rsp), %rax + mulq %rax + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq U8(%rsp), %rax + mulq U8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq U8(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq U8+0x8(%rsp), %rax + mulq U8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq U8+0x8(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq U8+0x10(%rsp), %rax + mulq %rax + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq U8+0x10(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq U8+0x18(%rsp), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %ebx + movq %r12, %rax + mulq %rbx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rbx + subq %rcx, %rdx + xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + movq %r8, U8(%rsp) + movq %r9, U8+0x8(%rsp) + movq %r10, U8+0x10(%rsp) + movq %r11, U8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz bignum_invsqrt_p25519_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. 
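The strict-reduction test described in the comment above relies only on the invariant x < 2^256 - 38 = 2 * p_25519: adding 19 and inspecting bit 255 of the sum decides whether a subtraction of p_25519 is needed. A small Python sketch (illustrative only):

    P = 2**255 - 19

    def strict_reduce(x):
        # Assumes only the stated invariant x < 2^256 - 38 = 2 * p_25519
        assert 0 <= x < 2 * P
        s = x + 19
        if s >> 255 == 0:            # "ns" case: x + 19 < 2^255, so x < p_25519
            return x
        return s & (2**255 - 1)      # keep x + 19 and clear bit 255 (= x - p_25519)

    for x in (0, P - 1, P, 2 * P - 1):
        assert strict_reduce(x) == x % P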
+ + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519.S index 5ec8de2de23..7d5282521fd 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519.S @@ -95,6 +95,7 @@ adcq %rbx, m3 S2N_BN_SYMBOL(bignum_madd_n25519): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519_alt.S index f264b79c29e..5abdd1377f3 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/bignum_madd_n25519_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_madd_n25519_alt.S @@ -95,6 +95,7 @@ adcq %rbx, m3 S2N_BN_SYMBOL(bignum_madd_n25519_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_m25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_m25519_4.S new file mode 100644 index 00000000000..72aa689aa72 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_m25519_4.S @@ -0,0 +1,96 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod m_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_m25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of curve25519/edwards25519. +// This is the full group order, 8 * the standard basepoint order. 
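Because any 4-word input satisfies x < 2^256 < 2 * m_25519, a single conditional subtraction performs the whole reduction. A Python reference model (an illustrative sketch, with the group order written out as an integer rather than as the limb constants used below):

    N_25519 = 2**252 + 27742317777372353535851937790883648493   # basepoint order
    M_25519 = 8 * N_25519                                        # full group order

    def mod_m25519_4(x):
        assert 0 <= x < 2**256        # any 4-limb value; note 2^256 < 2 * m_25519
        d = x - M_25519
        return x if d < 0 else d      # add m_25519 back iff the subtraction borrowed

    assert mod_m25519_4(M_25519 + 5) == 5
    assert mod_m25519_4(7) == 7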
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_m25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_m25519_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n0 %rax +#define n1 %r10 +#define n3 %r11 + +// Can re-use this as a temporary once we've loaded the input + +#define c %rsi + +S2N_BN_SYMBOL(bignum_mod_m25519_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load a set of registers [n3; 0; n1; n0] = m_25519 + + movq $0xc09318d2e7ae9f68, n0 + movq $0xa6f7cef517bce6b2, n1 + movq $0x8000000000000000, n3 + +// Load the input and compute x - m_25519 + + movq (x), d0 + subq n0, d0 + movq 8(x), d1 + sbbq n1, d1 + movq 16(x), d2 + sbbq $0, d2 + movq 24(x), d3 + sbbq n3, d3 + +// Now CF is set iff x < m_25519. Create a mask for that condition and mask +// the three nontrivial digits ready to undo the previous subtraction with +// a compensating addition + + sbbq c, c + andq c, n0 + andq c, n1 + andq c, n3 + +// Now add mask * m_25519 again and store + + addq n0, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/bignum_mod_n25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519.S similarity index 93% rename from third_party/s2n-bignum/x86_att/curve25519/bignum_mod_n25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519.S index 52c45899543..7d402e66919 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/bignum_mod_n25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519.S @@ -35,6 +35,7 @@ #define q %rbx S2N_BN_SYMBOL(bignum_mod_n25519): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -53,7 +54,7 @@ S2N_BN_SYMBOL(bignum_mod_n25519): // If the input is already <= 3 words long, go to a trivial "copy" path cmpq $4, k - jc shortinput + jc bignum_mod_n25519_shortinput // Otherwise load the top 4 digits (top-down) and reduce k by 4 // This [m3;m2;m1;m0] is the initial x where we begin reduction. @@ -119,7 +120,7 @@ S2N_BN_SYMBOL(bignum_mod_n25519): // estimation process. 
testq k, k - jz writeback + jz bignum_mod_n25519_writeback bignum_mod_n25519_loop: @@ -187,7 +188,7 @@ bignum_mod_n25519_loop: // Write back -writeback: +bignum_mod_n25519_writeback: movq m0, (z) movq m1, 8(z) @@ -205,7 +206,7 @@ writeback: #endif ret -shortinput: +bignum_mod_n25519_shortinput: xorq m0, m0 xorq m1, m1 @@ -213,15 +214,15 @@ shortinput: xorq m3, m3 testq k, k - jz writeback + jz bignum_mod_n25519_writeback movq (%rdx), m0 decq k - jz writeback + jz bignum_mod_n25519_writeback movq 8(%rdx), m1 decq k - jz writeback + jz bignum_mod_n25519_writeback movq 16(%rdx), m2 - jmp writeback + jmp bignum_mod_n25519_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519_4.S new file mode 100644 index 00000000000..11ca27133a3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_n25519_4.S @@ -0,0 +1,109 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo basepoint order, z := x mod n_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the order of the curve25519/edwards25519 basepoint. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n25519_4) + .text + +#define z %rdi +#define x %rsi + +#define q %rcx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +S2N_BN_SYMBOL(bignum_mod_n25519_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the top digit first, get the quotient estimate q = floor(x/2^252) +// and delete it from that digit, in effect doing x' = x - q * 2^252. +// Now we only need x' - q * n_25519' where n_25519' = n_25519 - 2^252 + + movq 24(x), q + movq q, d3 + shrq $60, q + shlq $4, d3 + shrq $4, d3 + +// Compute [%rdx;d2;d1] = q * n_25519' + + movq $0x5812631a5cf5d3ed, %rax + mulq q + movq %rax, d1 + movq %rdx, d2 + + movq $0x14def9dea2f79cd6, %rax + mulq q + addq %rax, d2 + adcq $0, %rdx + +// Subtract to get [d3;d2;d1;d0] = x - q * n_25519 + + movq (x), d0 + subq d1, d0 + movq 8(x), d1 + sbbq d2, d1 + movq 16(x), d2 + sbbq %rdx, d2 + sbbq $0, d3 + +// Get a bitmask q for the borrow and create masked version of +// non-trivial digits of [%rcx;0;%rdx;%rax] = n_25519. Note that +// %rcx = q but we can get it from the corresponding bit of %rax. 
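The quotient estimate q = floor(x / 2^252) used here is exact or one too large for any 4-word x, so a single borrow-triggered compensating addition of n_25519 completes the reduction. A compact Python model (illustrative sketch only):

    N_25519 = 2**252 + 27742317777372353535851937790883648493

    def mod_n25519_4(x):
        assert 0 <= x < 2**256
        q = x >> 252                  # quotient estimate, exact or one too large
        r = x - q * N_25519           # may dip below zero by less than n_25519
        return r + N_25519 if r < 0 else r

    for x in (0, N_25519 - 1, N_25519, 2**256 - 1, 15 * N_25519 + 7):
        assert mod_n25519_4(x) == x % N_25519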
+ + sbbq q, q + + movq $0x5812631a5cf5d3ed, %rax + andq q, %rax + movq $0x14def9dea2f79cd6, %rdx + andq q, %rdx + movq $0x1000000000000000, %rcx + andq %rax, %rcx + +// Do compensating addition (iff subtraction borrowed) and store + + addq %rax, d0 + movq d0, (z) + adcq %rdx, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq %rcx, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi + +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_p25519_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_p25519_4.S new file mode 100644 index 00000000000..2618031c6de --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mod_p25519_4.S @@ -0,0 +1,97 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p25519_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p25519_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p25519_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 +#define c %r10 + +#define q %rax + +#define qshort %eax +#define cshort %r10d + +S2N_BN_SYMBOL(bignum_mod_p25519_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the inputs as [d3;d2;d1;d0] + + movq (x), d0 + movq 8(x), d1 + movq 16(x), d2 + movq 24(x), d3 + +// Letting x = 2^255 * h + l where h is the top bit, the provisional quotient +// is q = h + 1, which is either correct or 1 too high. + + movl $1, qshort + xorl cshort, cshort + bts $63, d3 + adcq c, q + imulq $19, q + +// Writing the provisional remainder as r = x - (2^255 - 19) * q we +// compute r' = (2^255 + l) + 19 * q = r + 2^256 + + addq q, d0 + adcq c, d1 + adcq c, d2 + adcq c, d3 + +// Now r < 0 <=> r' < 2^256 <=> ~CF and in this case we correct by adding +// 2^255 - 19, or in fact subtracting 19 and masking to 255 bits. + + movl $19, qshort + cmovcq c, q + + subq q, d0 + sbbq c, d1 + sbbq c, d2 + sbbq c, d3 + btr $63, d3 + +// Store the end result + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519.S new file mode 100644 index 00000000000..9ff1d5ac3dd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519.S @@ -0,0 +1,202 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_25519, z := (x * y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p25519) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// mulpadd(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries. + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +// mulpade(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. + +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +S2N_BN_SYMBOL(bignum_mul_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different + + movq (y), %rdx + + mulxq (x), %r8, %r9 + mulxq 8(x), %rax, %r10 + addq %rax, %r9 + mulxq 16(x), %rax, %r11 + adcq %rax, %r10 + mulxq 24(x), %rax, %r12 + adcq %rax, %r11 + adcq zero, %r12 + +// Add row 1 + + xorl zeroe, zeroe + movq 8(y), %rdx + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpade(%r13,%r12,24(x)) + adcq zero, %r13 + +// Add row 2 + + xorl zeroe, zeroe + movq 16(y), %rdx + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + mulpade(%r14,%r13,24(x)); + adcq zero, %r14 + +// Add row 3; also use an early 38*r15+r11 to get a quotient estimate q +// and then squeeze in a 19 * q computation to inject into the next +// double-carry chain. At the end %rcx = q and %rax = 19 * q. + + xorl zeroe, zeroe + movq 24(y), %rdx + + mulpadd(%r12,%r11,(x)) + + mulxq 24(x), %rcx, %r15 + + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)) + + movl $38, %edx + mulxq %r15, %rax, %rbx + + adcxq %rcx, %r14 + adoxq zero, %r15 + adcq zero, %r15 + + addq %r11, %rax + adcq zero, %rbx + btq $63, %rax + adcq %rbx, %rbx + leaq 1(%rbx), %rcx + imulq $19, %rcx + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 38 * h + l (mod p_25519) +// We add in the precalculated 19 * q as well. +// This is kept in 4 words since we have enough information there. 
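The reduction rests on 2^256 == 38 and 2^255 == 19 (mod p_25519): the double-width product folds to 38*h + l, and the quotient estimate taken from the top of that folded value is exact or one too big. A Python model of the arithmetic, without the limb-level carry scheduling (illustrative sketch only):

    P = 2**255 - 19

    def mul_p25519_ref(x, y):
        prod = x * y                     # full double-width product
        h, l = prod >> 256, prod & (2**256 - 1)
        v = 38 * h + l                   # fold: 2^256 * h + l == 38*h + l (mod p)
        q = (v >> 255) + 1               # quotient estimate: exact or one too big
        r = v - q * P                    # lies in [-p, p) for any 256-bit x, y
        return r + P if r < 0 else r

    import random
    for _ in range(1000):
        x, y = random.getrandbits(256), random.getrandbits(256)
        assert mul_p25519_ref(x, y) == (x * y) % P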
+ + xorl zeroe, zeroe + adoxq %rcx, %r8 + mulpadd(%r9,%r8,%r12) + mulpadd(%r10,%r9,%r13) + mulpadd(%r11,%r10,%r14) + mulxq %r15, %rax, %rbx + adcq %rax, %r11 + +// We still haven't made the -2^255 * q contribution yet. Since we +// are now safely in 4 words we just need a single bit of q, and we +// can actually use the LSB of %rcx = 19 * q since 19 is odd. And we +// don't literally need to subtract, just to see whether we would +// have a top 1 bit if we did, meaning we need to correct in the +// last step by adding 2^255 - 19. + + shlq $63, %rcx + cmpq %rcx, %r11 + movl $19, %eax + cmovns zero, %rax + +// Now make that possible correction and finally mask to 255 bits + + subq %rax, %r8 + sbbq zero, %r9 + sbbq zero, %r10 + sbbq zero, %r11 + btr $63, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519_alt.S new file mode 100644 index 00000000000..d339b0196d7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_mul_p25519_alt.S @@ -0,0 +1,217 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_25519, z := (x * y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p25519_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p25519_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// Re-use input pointers later for constant and top carry + +#define d %rsi +#define c %rcx + +#define dshort %esi + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_mul_p25519_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Start the window as [%r10;%r9;%r8] with 00 product + + movq (x), %rax + mulq (y) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Column 1 + + xorq %r11, %r11 + combads(%r10,%r9,(x),8(y)) + combadd(%r11,%r10,%r9,8(x),(y)) 
+ +// Column 2 + + xorq %r12, %r12 + combadz(%r12,%r11,%r10,(x),16(y)) + combadd(%r12,%r11,%r10,8(x),8(y)) + combadd(%r12,%r11,%r10,16(x),(y)) + +// Column 3 + + xorq %r13, %r13 + combadz(%r13,%r12,%r11,(x),24(y)) + combadd(%r13,%r12,%r11,8(x),16(y)) + combadd(%r13,%r12,%r11,16(x),8(y)) + combadd(%r13,%r12,%r11,24(x),(y)) + +// Column 4 + + xorq %r14, %r14 + combadz(%r14,%r13,%r12,8(x),24(y)) + combadd(%r14,%r13,%r12,16(x),16(y)) + combadd(%r14,%r13,%r12,24(x),8(y)) + +// Column 5 + + xorq %r15, %r15 + combadz(%r15,%r14,%r13,16(x),24(y)) + combadd(%r15,%r14,%r13,24(x),16(y)) + +// Final work for columns 6 and 7 + + movq 24(x), %rax + mulq 24(y) + addq %rax, %r14 + adcq %rdx, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 38 * h + l (mod p_25519) + + movl $38, dshort + + movq %r12, %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + sbbq c, c + + movq %r13, %rax + mulq d + subq c, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq c, c + + movq %r14, %rax + mulq d + subq c, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq c, c + + movq %r15, %rax + mulq d + subq c, %rdx + xorq c, c + addq %rax, %r11 + movq %rdx, %r12 + adcq c, %r12 + +// Now we have reduced to 5 digits, 2^255 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + shldq $1, %r11, %r12 + leaq 1(%r12), %rax + movl $19, dshort + bts $63, %r11 + imulq d, %rax + addq %rax, %r8 + adcq c, %r9 + adcq c, %r10 + adcq c, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + sbbq %rax, %rax + notq %rax + andq d, %rax + subq %rax, %r8 + sbbq c, %r9 + sbbq c, %r10 + sbbq c, %r11 + btr $63, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/bignum_neg_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_neg_p25519.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/bignum_neg_p25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_neg_p25519.S index 5e66073baf6..7b9408f0e8f 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/bignum_neg_p25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_neg_p25519.S @@ -33,6 +33,7 @@ #define qshort %esi S2N_BN_SYMBOL(bignum_neg_p25519): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_optneg_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_optneg_p25519.S new file mode 100644 index 00000000000..61ff47cd297 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_optneg_p25519.S @@ -0,0 +1,98 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_25519, z := (-x) mod p_25519 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p25519 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = p, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = p, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p25519) + .text + +#define z %rdi +#define q %rsi +#define x %rdx + +#define n0 %rax +#define n1 %rcx +#define n2 %r8 +#define n3 %r9 + +S2N_BN_SYMBOL(bignum_optneg_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Adjust q by zeroing it if the input is zero (to avoid giving -0 = p_25519, +// which is not strictly reduced even though it's correct modulo p_25519). + + movq (x), n0 + orq 8(x), n0 + movq 16(x), n1 + orq 24(x), n1 + orq n1, n0 + negq n0 + sbbq n0, n0 + andq n0, q + +// Turn q into a bitmask, all 1s for q=false, all 0s for q=true + + negq q + sbbq q, q + notq q + +// Let [n3;n2;n1;n0] = if q then p_25519 else -1 + + movq $0xffffffffffffffed, n0 + orq q, n0 + movq $0xffffffffffffffff, n1 + movq n1, n2 + movq $0x7fffffffffffffff, n3 + orq q, n3 + +// Subtract so [n3;n2;n1;n0] = if q then p_25519 - x else -1 - x + + subq (x), n0 + sbbq 8(x), n1 + sbbq 16(x), n2 + sbbq 24(x), n3 + +// XOR the words with the bitmask, which in the case q = false has the +// effect of restoring ~(-1 - x) = -(-1 - x) - 1 = 1 + x - 1 = x +// and write back the digits to the output + + xorq q, n0 + movq n0, (z) + xorq q, n1 + movq n1, 8(z) + xorq q, n2 + movq n2, 16(z) + xorq q, n3 + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519.S new file mode 100644 index 00000000000..45a6890dd39 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519.S @@ -0,0 +1,186 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_25519, z := (x^2) mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p25519 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p25519) + .text + +#define z %rdi +#define x %rsi + +// Use this fairly consistently for a zero + +#define zero %rbx +#define zeroe %ebx + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +// mulpade(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. + +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +S2N_BN_SYMBOL(bignum_sqr_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rdx + mulxq %rdx, %r8, %r15 + mulxq 8(x), %r9, %r10 + mulxq 24(x), %r11, %r12 + movq 16(x), %rdx + mulxq 24(x), %r13, %r14 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + movq 24(x), %rdx + mulpadd(%r13,%r12,8(x)) + adcxq zero, %r13 + adoxq zero, %r14 + adcq zero, %r14 + +// Double and add to the 00 + 11 + 22 + 33 terms, while also +// pre-estimating the quotient from early results. + + xorl zeroe, zeroe + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 8(x), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rcx, %r11 + movq 16(x), %rdx + mulxq %rdx, %rax, %rcx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rcx, %r13 + movq 24(x), %rdx + mulxq %rdx, %rax, %r15 + + movl $38, %edx + mulxq %r15, %rdx, %rcx + + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq zero, %r15 + adoxq zero, %r15 + + addq %r11, %rdx + adcq zero, %rcx + shldq $1, %rdx, %rcx + leaq 1(%rcx), %rbx + imulq $19, %rbx + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 38 * h + l (mod p_25519) +// We add in the precalculated 19 * q as well. +// This is kept in 4 words since we have enough information there. + + xorl %eax, %eax + adoxq %rbx, %r8 + movl $38, %edx + mulpadd(%r9,%r8,%r12) + mulpadd(%r10,%r9,%r13) + mulpadd(%r11,%r10,%r14) + mulxq %r15, %rax, %rcx + adcq %rax, %r11 + +// We still haven't made the -2^255 * q contribution yet. 
Since we +// are now safely in 4 words we just need a single bit of q, and we +// can actually use the LSB of %rcx = 19 * q since 19 is odd. And we +// don't literally need to subtract, just to see whether we would +// have a top 1 bit if we did, meaning we need to correct in the +// last step by adding 2^255 - 19. + + xorl %ecx, %ecx + shlq $63, %rbx + cmpq %rbx, %r11 + movl $19, %eax + cmovns %rcx, %rax + +// Now make that possible correction and finally mask to 255 bits + + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rcx, %r11 + btr $63, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519_alt.S new file mode 100644 index 00000000000..17bef47a1a8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqr_p25519_alt.S @@ -0,0 +1,201 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_25519, z := (x^2) mod p_25519 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p25519_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p25519_alt) + .text + +#define z %rdi +#define x %rsi + +// Re-use input pointer later for constant + +#define d %rsi +#define c %rcx + +#define dshort %esi + +// Macro for the key "multiply and add to (c,h,l)" step, for square term + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling before adding, for non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_sqr_p25519_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Result term 1 + + xorq %r11, %r11 + combadd2(%r11,%r10,%r9,(x),8(x)) + +// Result term 2 + + xorq %r12, %r12 + combadd1(%r12,%r11,%r10,8(x)) + combadd2(%r12,%r11,%r10,(x),16(x)) + +// Result term 3 + + xorq %r13, %r13 + combadd2(%r13,%r12,%r11,(x),24(x)) + combadd2(%r13,%r12,%r11,8(x),16(x)) + +// Result term 4 + + xorq %r14, %r14 + combadd2(%r14,%r13,%r12,8(x),24(x)) + combadd1(%r14,%r13,%r12,16(x)) + +// Result term 5 + + xorq %r15, %r15 + combadd2(%r15,%r14,%r13,16(x),24(x)) + 
+// Result term 6 + + combads(%r15,%r14,24(x)) + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 38 * h + l (mod p_25519) + + movl $38, dshort + + movq %r12, %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + sbbq c, c + + movq %r13, %rax + mulq d + subq c, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq c, c + + movq %r14, %rax + mulq d + subq c, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq c, c + + movq %r15, %rax + mulq d + subq c, %rdx + xorq c, c + addq %rax, %r11 + movq %rdx, %r12 + adcq c, %r12 + +// Now we have reduced to 5 digits, 2^255 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + shldq $1, %r11, %r12 + leaq 1(%r12), %rax + movl $19, dshort + bts $63, %r11 + imulq d, %rax + addq %rax, %r8 + adcq c, %r9 + adcq c, %r10 + adcq c, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [u3,u2,u1,u0] +// So we correct if CF = 0 by subtracting 19, either way masking to +// 255 bits, i.e. by effectively adding p_25519 to the "full" answer + + sbbq %rax, %rax + notq %rax + andq d, %rax + subq %rax, %r8 + sbbq c, %r9 + sbbq c, %r10 + sbbq c, %r11 + btr $63, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519.S new file mode 100644 index 00000000000..7762cf69a45 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519.S @@ -0,0 +1,595 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_sqrt_p25519(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular square root mod p_25519, i.e. +// a z such that z^2 == x (mod p_25519), whenever one exists. The square +// root z is chosen so that its LSB is even (note that p_25519 - z is +// another square root). 
The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular square root +// and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 and z is the square root 0 +// +1: x is coprime to p_25519 and z is a square root +// -1: x is coprime to p_25519 but not a quadratic residue +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqrt_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqrt_p25519) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a 0(%rsp) +#define b (4*N)(%rsp) +#define s (8*N)(%rsp) +#define t (12*N)(%rsp) +#define u (16*N)(%rsp) +#define res (20*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (22*N) + +// Corrupted versions when stack is down 8 more + +#define u8 (17*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define A 0 +#define B (4*N) +#define S (8*N) +#define T (12*N) +#define U (16*N) +#define U8 (17*N) + +S2N_BN_SYMBOL(bignum_sqrt_p25519): + _CET_ENDBR + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq bignum_sqrt_p25519_standard + popq %rsi + popq %rdi + ret + +bignum_sqrt_p25519_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Set up reduced version of the input argument a = x mod p_25519. 
Then +// get the candidate square root s = a^{252-2} + + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, A(%rsp) + movq %rcx, A+0x8(%rsp) + movq %r8, A+0x10(%rsp) + movq %r9, A+0x18(%rsp) + + // Power 2^2 - 1 = 3 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^4 - 1 = 15 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^5 - 1 = 31 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^10 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^20 - 1 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^25 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^50 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^100 - 1 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^125 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^250 - 1 + + leaq S(%rsp), %rdi + movq $125, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^251 - 1 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + + // Power 2^252 - 2 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + +// s is now one candidate square root. 
Generate the other one t = s * j_25519 + + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_mul_p25519 + +// Now multiplex between them according to whether s^2 = a + + leaq B(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq bignum_sqrt_p25519_nsqr_p25519 + + movq A(%rsp), %rax + xorq B(%rsp), %rax + movq A+8(%rsp), %rbx + xorq B+8(%rsp), %rbx + orq %rbx, %rax + movq A+16(%rsp), %rcx + xorq B+16(%rsp), %rcx + movq A+24(%rsp), %rdx + xorq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + + movq S(%rsp), %rax + movq T(%rsp), %rbx + cmovnzq %rbx, %rax + movq S+8(%rsp), %rbx + movq T+8(%rsp), %rcx + cmovnzq %rcx, %rbx + movq S+16(%rsp), %rcx + movq T+16(%rsp), %rdx + cmovnzq %rdx, %rcx + movq S+24(%rsp), %rbp + movq T+24(%rsp), %rdx + cmovnzq %rdx, %rbp + +// For definiteness, choose "positive" (LSB=0) square root + + xorl %edx, %edx + leaq -19(%rdx), %r8 + leaq -1(%rdx), %r11 + movq %r11, %r9 + movq %r11, %r10 + btr $63, %r11 + + subq %rax, %r8 + sbbq %rbx, %r9 + sbbq %rcx, %r10 + sbbq %rbp, %r11 + + movq res, %rdx + testq $1, %rax + cmovnzq %r8, %rax + movq %rax, (%rdx) + cmovnzq %r9, %rbx + movq %rbx, 8(%rdx) + cmovnzq %r10, %rcx + movq %rcx, 16(%rdx) + cmovnzq %r11, %rbp + movq %rbp, 24(%rdx) + +// Determine if it is is indeed a square root and also if a = 0 +// Hence return the Legendre-Jacobi symbol as required. + + leaq B(%rsp), %rdi + movq $1, %rsi + callq bignum_sqrt_p25519_nsqr_p25519 + + movq A(%rsp), %rax + movq %rax, %rbp + xorq B(%rsp), %rax + movq A+8(%rsp), %rbx + orq %rbx, %rbp + xorq B+8(%rsp), %rbx + orq %rbx, %rax + movq A+16(%rsp), %rcx + orq %rcx, %rbp + xorq B+16(%rsp), %rcx + movq A+24(%rsp), %rdx + orq %rdx, %rbp + xorq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + negq %rax + sbbq %rax, %rax + leaq 1(%rax,%rax,1), %rax + + testq %rbp, %rbp + cmovzq %rbp, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_sqrt_p25519_mul_p25519: + movq %rdx, %rcx + xorl %ebp, %ebp + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsi), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsi), %rax, %r12 + adcq %rax, %r11 + adcq %rbp, %r12 + xorl %ebp, %ebp + movq 0x8(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + xorl %ebp, %ebp + movq 0x10(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + movq 0x18(%rcx), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rcx, %r15 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), 
%rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movl $0x26, %edx + mulxq %r15, %rax, %rbx + adcxq %rcx, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + addq %r11, %rax + adcq %rbp, %rbx + btq $0x3f, %rax + adcq %rbx, %rbx + leaq 0x1(%rbx), %rcx + imulq $0x13, %rcx, %rcx + xorl %ebp, %ebp + adoxq %rcx, %r8 + mulxq %r12, %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq %r13, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r14, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq %r15, %rax, %rbx + adcq %rax, %r11 + shlq $0x3f, %rcx + cmpq %rcx, %r11 + movl $0x13, %eax + cmovns %rbp, %rax + subq %rax, %r8 + sbbq %rbp, %r9 + sbbq %rbp, %r10 + sbbq %rbp, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_sqrt_p25519_nsqr_p25519: + +// Copy input argument into u + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + movq %rax, U8(%rsp) + movq %rbx, U8+8(%rsp) + movq %rcx, U8+16(%rsp) + movq %rdx, U8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_sqrt_p25519_loop: + movq U8(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq U8+0x8(%rsp), %r9, %r10 + mulxq U8+0x18(%rsp), %r11, %r12 + movq U8+0x10(%rsp), %rdx + mulxq U8+0x18(%rsp), %r13, %r14 + xorl %ebx, %ebx + mulxq U8(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq U8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + movq U8+0x18(%rsp), %rdx + mulxq U8+0x8(%rsp), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rbx, %r13 + adoxq %rbx, %r14 + adcq %rbx, %r14 + xorl %ebx, %ebx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq U8+0x8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq U8+0x10(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq U8+0x18(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbx, %r15 + adoxq %rbx, %r15 + movl $0x26, %edx + xorl %ebx, %ebx + mulxq %r12, %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq %r13, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r14, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq %r15, %rax, %r12 + adcxq %rax, %r11 + adoxq %rbx, %r12 + adcxq %rbx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rbx, %r9 + adcq %rbx, %r10 + adcq %rbx, %r11 + movq %r8, U8(%rsp) + movq %r9, U8+0x8(%rsp) + movq %r10, U8+0x10(%rsp) + movq %r11, U8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz bignum_sqrt_p25519_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. 
+ + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519_alt.S new file mode 100644 index 00000000000..6721bc8b283 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sqrt_p25519_alt.S @@ -0,0 +1,676 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square root modulo p_25519 = 2^255 - 19 +// Input x[4]; output function return (Legendre symbol) and z[4] +// +// extern int64_t bignum_sqrt_p25519_alt(uint64_t z[static 4],uint64_t x[static 4]); +// +// Given a 4-digit input x, returns a modular square root mod p_25519, i.e. +// a z such that z^2 == x (mod p_25519), whenever one exists. The square +// root z is chosen so that its LSB is even (note that p_25519 - z is +// another square root). The function return is the Legendre/Jacobi symbol +// (x//p_25519), which indicates whether indeed x has a modular square root +// and hence whether the result is meaningful: +// +// 0: x is divisible by p_25519 and z is the square root 0 +// +1: x is coprime to p_25519 and z is a square root +// -1: x is coprime to p_25519 but not a quadratic residue +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqrt_p25519_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqrt_p25519_alt) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define a 0(%rsp) +#define b (4*N)(%rsp) +#define s (8*N)(%rsp) +#define t (12*N)(%rsp) +#define u (16*N)(%rsp) +#define res (20*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (22*N) + +// Corrupted versions when stack is down 8 more + +#define u8 (17*N)(%rsp) + +// Syntactic variants to make x86_att version simpler to generate + +#define A 0 +#define B (4*N) +#define S (8*N) +#define T (12*N) +#define U (16*N) +#define U8 (17*N) + +S2N_BN_SYMBOL(bignum_sqrt_p25519_alt): + _CET_ENDBR + +// In this case the Windows form literally makes a subroutine call. +// This avoids hassle arising from subroutine offsets + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + callq bignum_sqrt_p25519_alt_standard + popq %rsi + popq %rdi + ret + +bignum_sqrt_p25519_alt_standard: +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Set up reduced version of the input argument a = x mod p_25519. 
Then +// get the candidate square root s = a^{252-2} + + movq (%rsi), %rdx + movq 0x8(%rsi), %rcx + movq 0x10(%rsi), %r8 + movq 0x18(%rsi), %r9 + movl $0x1, %eax + xorl %r10d, %r10d + bts $0x3f, %r9 + adcq %r10, %rax + imulq $0x13, %rax, %rax + addq %rax, %rdx + adcq %r10, %rcx + adcq %r10, %r8 + adcq %r10, %r9 + movl $0x13, %eax + cmovbq %r10, %rax + subq %rax, %rdx + sbbq %r10, %rcx + sbbq %r10, %r8 + sbbq %r10, %r9 + btr $0x3f, %r9 + movq %rdx, A(%rsp) + movq %rcx, A+0x8(%rsp) + movq %r8, A+0x10(%rsp) + movq %r9, A+0x18(%rsp) + + // Power 2^2 - 1 = 3 + + leaq T(%rsp), %rdi + movq $1, %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq T(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^4 - 1 = 15 + + leaq S(%rsp), %rdi + movq $2, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^5 - 1 = 31 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^10 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^20 - 1 + + leaq S(%rsp), %rdi + movq $10, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^25 - 1 + + leaq S(%rsp), %rdi + movq $5, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^50 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^100 - 1 + + leaq S(%rsp), %rdi + movq $50, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^125 - 1 + + leaq S(%rsp), %rdi + movq $25, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^250 - 1 + + leaq S(%rsp), %rdi + movq $125, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq B(%rsp), %rdi + leaq S(%rsp), %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^251 - 1 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq B(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq A(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + + // Power 2^252 - 2 + + leaq S(%rsp), %rdi + movq $1, %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + +// s is now one candidate square root. 
Generate the other one t = s * j_25519 + + movq $0xc4ee1b274a0ea0b0, %rax + movq %rax, T(%rsp) + movq $0x2f431806ad2fe478, %rax + movq %rax, T+8(%rsp) + movq $0x2b4d00993dfbd7a7, %rax + movq %rax, T+16(%rsp) + movq $0x2b8324804fc1df0b, %rax + movq %rax, T+24(%rsp) + + leaq T(%rsp), %rdi + leaq S(%rsp), %rsi + leaq T(%rsp), %rdx + callq bignum_sqrt_p25519_alt_mul_p25519 + +// Now multiplex between them according to whether s^2 = a + + leaq B(%rsp), %rdi + movq $1, %rsi + leaq S(%rsp), %rdx + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + movq A(%rsp), %rax + xorq B(%rsp), %rax + movq A+8(%rsp), %rbx + xorq B+8(%rsp), %rbx + orq %rbx, %rax + movq A+16(%rsp), %rcx + xorq B+16(%rsp), %rcx + movq A+24(%rsp), %rdx + xorq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + + movq S(%rsp), %rax + movq T(%rsp), %rbx + cmovnzq %rbx, %rax + movq S+8(%rsp), %rbx + movq T+8(%rsp), %rcx + cmovnzq %rcx, %rbx + movq S+16(%rsp), %rcx + movq T+16(%rsp), %rdx + cmovnzq %rdx, %rcx + movq S+24(%rsp), %rbp + movq T+24(%rsp), %rdx + cmovnzq %rdx, %rbp + +// For definiteness, choose "positive" (LSB=0) square root + + xorl %edx, %edx + leaq -19(%rdx), %r8 + leaq -1(%rdx), %r11 + movq %r11, %r9 + movq %r11, %r10 + btr $63, %r11 + + subq %rax, %r8 + sbbq %rbx, %r9 + sbbq %rcx, %r10 + sbbq %rbp, %r11 + + movq res, %rdx + testq $1, %rax + cmovnzq %r8, %rax + movq %rax, (%rdx) + cmovnzq %r9, %rbx + movq %rbx, 8(%rdx) + cmovnzq %r10, %rcx + movq %rcx, 16(%rdx) + cmovnzq %r11, %rbp + movq %rbp, 24(%rdx) + +// Determine if it is is indeed a square root and also if a = 0 +// Hence return the Legendre-Jacobi symbol as required. + + leaq B(%rsp), %rdi + movq $1, %rsi + callq bignum_sqrt_p25519_alt_nsqr_p25519 + + movq A(%rsp), %rax + movq %rax, %rbp + xorq B(%rsp), %rax + movq A+8(%rsp), %rbx + orq %rbx, %rbp + xorq B+8(%rsp), %rbx + orq %rbx, %rax + movq A+16(%rsp), %rcx + orq %rcx, %rbp + xorq B+16(%rsp), %rcx + movq A+24(%rsp), %rdx + orq %rdx, %rbp + xorq B+24(%rsp), %rdx + orq %rdx, %rcx + orq %rcx, %rax + negq %rax + sbbq %rax, %rax + leaq 1(%rax,%rax,1), %rax + + testq %rbp, %rbp + cmovzq %rbp, %rax + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// ************************************************************* +// Local z = x * y +// ************************************************************* + +bignum_sqrt_p25519_alt_mul_p25519: + movq %rdx, %rcx + movq (%rsi), %rax + mulq (%rcx) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsi), %rax + mulq (%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq (%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsi), %rax + mulq (%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsi), %rax + mulq (%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rax + mulq 
0x10(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %esi + movq %r12, %rax + mulq %rsi + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rsi + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rsi + subq %rcx, %rdx + xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + leaq 0x1(%r12), %rax + movl $0x13, %esi + bts $0x3f, %r11 + imulq %rsi, %rax + addq %rax, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + sbbq %rax, %rax + notq %rax + andq %rsi, %rax + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rcx, %r10 + sbbq %rcx, %r11 + btr $0x3f, %r11 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + ret + +// ************************************************************* +// Local z = 2^n * x +// ************************************************************* + +bignum_sqrt_p25519_alt_nsqr_p25519: + +// Copy input argument into u + + movq (%rdx), %rax + movq 8(%rdx), %rbx + movq 16(%rdx), %rcx + movq 24(%rdx), %rdx + movq %rax, U8(%rsp) + movq %rbx, U8+8(%rsp) + movq %rcx, U8+16(%rsp) + movq %rdx, U8+24(%rsp) + +// Main squaring loop, accumulating in u consistently and +// only ensuring the intermediates are < 2 * p_25519 = 2^256 - 38 + +bignum_sqrt_p25519_alt_loop: + movq U8(%rsp), %rax + mulq %rax + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq U8(%rsp), %rax + mulq U8+0x8(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq U8+0x8(%rsp), %rax + mulq %rax + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq U8(%rsp), %rax + mulq U8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r12 + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq U8(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq U8+0x8(%rsp), %rax + mulq U8+0x10(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r13 + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq U8+0x8(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r14 + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq U8+0x10(%rsp), %rax + mulq %rax + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq U8+0x10(%rsp), %rax + mulq U8+0x18(%rsp) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r15 + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq U8+0x18(%rsp), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + movl $0x26, %ebx + movq %r12, %rax + mulq %rbx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rcx, %rcx + movq %r13, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r14, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq %r15, %rax + mulq %rbx + subq %rcx, %rdx + 
xorq %rcx, %rcx + addq %rax, %r11 + movq %rdx, %r12 + adcq %rcx, %r12 + shldq $0x1, %r11, %r12 + btr $0x3f, %r11 + movl $0x13, %edx + imulq %r12, %rdx + addq %rdx, %r8 + adcq %rcx, %r9 + adcq %rcx, %r10 + adcq %rcx, %r11 + movq %r8, U8(%rsp) + movq %r9, U8+0x8(%rsp) + movq %r10, U8+0x10(%rsp) + movq %r11, U8+0x18(%rsp) + +// Loop as applicable + + decq %rsi + jnz bignum_sqrt_p25519_alt_loop + +// We know the intermediate result x < 2^256 - 38, and now we do strict +// modular reduction mod 2^255 - 19. Note x < 2^255 - 19 <=> x + 19 < 2^255 +// which is equivalent to a "ns" condition. We just use the results where +// they were in registers [%r11;%r10;%r9;%r8] instead of re-loading them. + + movl $19, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovns %r8, %rax + cmovns %r9, %rbx + cmovns %r10, %rcx + cmovns %r11, %rdx + btr $63, %rdx + movq %rax, (%rdi) + movq %rbx, 8(%rdi) + movq %rcx, 16(%rdi) + movq %rdx, 24(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sub_p25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sub_p25519.S new file mode 100644 index 00000000000..ecfd00a930d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/bignum_sub_p25519.S @@ -0,0 +1,85 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_25519, z := (x - y) mod p_25519 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p25519 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p25519) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p25519) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +#define zero %rax +#define zeroe %eax +#define c %rcx +#define cshort %ecx + +S2N_BN_SYMBOL(bignum_sub_p25519): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and subtract the two inputs as [d3;d2;d1;d0] = x - y (modulo 2^256) + + movq (x), d0 + subq (y), d0 + movq 8(x), d1 + sbbq 8(y), d1 + movq 16(x), d2 + sbbq 16(y), d2 + movq 24(x), d3 + sbbq 24(y), d3 + +// Now if x < y we want to add back p_25519, which staying within 4 digits +// means subtracting 19, since p_25519 = 2^255 - 19. +// Let c be that constant 19 when x < y, zero otherwise. 
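+//
+// Illustrative note (not part of the upstream comment): since
+// 2^256 - p_25519 = 2^255 + 19, the wrapped difference x - y + 2^256 is
+// turned into x - y + p_25519 by subtracting 19 and then clearing bit 255,
+// which is what the code below does; when x >= y, c = 0 and bit 255 is
+// already clear, so neither step changes the value. In pseudocode:
+//
+//   c = 19 if borrow else 0
+//   z = (((x - y) mod 2**256) - c) & (2**255 - 1)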
+ + sbbq c, c + xorl zeroe, zeroe + andq $19, c + +// Correct by adding the optional constant and masking to 255 bits + + subq c, d0 + movq d0, (z) + sbbq zero, d1 + movq d1, 8(z) + sbbq zero, d2 + movq d2, 16(z) + sbbq zero, d3 + btr $63, d3 + movq d3, 24(z) +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep.S new file mode 100644 index 00000000000..cc05e86bdaa --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep.S @@ -0,0 +1,743 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery ladder step on pairs of (X,Z)-projective curve25519 points +// +// extern void curve25519_ladderstep +// (uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b) +// +// If point = (X,1) and pp = (n * (X,1),[n+1] * (X,1)) then the output +// rr = (n' * (X,1),[n'+1] * (X,1)) where n' = 2 * n + b, with input +// b assumed to be 0 or 1; in this setting, each pair (X,Z) is assumed to +// be a projective y-free representation of an affine curve25519 point +// (X/Z,y), with the initial "differential" point having Z = 1 and X its +// affine x coordinate. In other words, the ladderstep operation is a +// combination of doubling, differential addition and optional swapping. +// +// Standard x86-64 ABI: RDI = rr, RSI = point, RDX = pp, RCX = b +// Microsoft x64 ABI: RCX = rr, RDX = point, R8 = pp, R9 = b +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_ladderstep) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_ladderstep) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// The single field of the input point used (z assumed 1) + +#define point_x 0(%rbp) + +// Pointer-offset pairs for pp fields +// These use the initial register %rdx as the offset. +// We then never need it again so it can be ephemeral + +#define xn 0(%rdx) +#define zn NUMSIZE(%rdx) +#define xm (2*NUMSIZE)(%rdx) +#define zm (3*NUMSIZE)(%rdx) + +// Result fields + +#define res0 0(%rbp) +#define res1 NUMSIZE(%rbp) +#define res2 (2*NUMSIZE)(%rbp) +#define res3 (3*NUMSIZE)(%rbp) + +// Pointer-offset pairs for temporaries on stack +// dmsn and dnsm need space for >= 5 digits, and we allocate 8 + +#define sm (0*NUMSIZE)(%rsp) +#define sn (1*NUMSIZE)(%rsp) +#define dm (2*NUMSIZE)(%rsp) +#define dn (3*NUMSIZE)(%rsp) +#define dmsn (4*NUMSIZE)(%rsp) +#define dnsm (6*NUMSIZE)(%rsp) +#define s (8*NUMSIZE)(%rsp) +#define d (9*NUMSIZE)(%rsp) +#define p (10*NUMSIZE)(%rsp) + +// Preserved inputs + +#define rr (12*NUMSIZE)(%rsp) +#define point (12*NUMSIZE)+8(%rsp) +#define pp (12*NUMSIZE)+16(%rsp) +#define bb (12*NUMSIZE)+24(%rsp) + +// More, but aliases to above + +#define sumx sm +#define sumz sn +#define dubx dm +#define dubz dn +#define e dubz +#define spro dnsm +#define dpro sumz + +// Total size to reserve on the stack + +#define NSPACE (13*NUMSIZE) + +// Macros wrapping up the basic field operation calls +// bignum_mul_p25519 and bignum_sqr_p25519. 
+// These two are only trivially different from pure +// function calls to those subroutines. + +#define mul_p25519(P0,P1,P2) \ + xorl %edi, %edi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rdi, %r12 ; \ + xorl %edi, %edi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rdi, %r13 ; \ + adcxq %rdi, %r13 ; \ + xorl %edi, %edi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rdi, %r14 ; \ + adcxq %rdi, %r14 ; \ + xorl %edi, %edi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rdi, %r15 ; \ + adcxq %rdi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %edi, %edi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %r12 ; \ + adcxq %rdi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rdi, %r10 ; \ + adcq %rdi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rdi, %r9 ; \ + sbbq %rdi, %r10 ; \ + sbbq %rdi, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define sqr_p25519(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; 
\ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + leaq 0x1(%r12), %rax ; \ + bts $0x3f, %r11 ; \ + imulq %rdx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rdx ; \ + subq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + btr $0x3f, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplication just giving a 5-digit result (actually < 39 * p_25519) +// by not doing anything beyond the first stage of reduction + +#define mul_5(P0,P1,P2) \ + xorl %edi, %edi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rdi, %r12 ; \ + xorl %edi, %edi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rdi, %r13 ; \ + adcxq %rdi, %r13 ; \ + xorl %edi, %edi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rdi, %r14 ; \ + adcxq %rdi, %r14 ; \ + xorl %edi, %edi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rdi, %r15 ; \ + adcxq %rdi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %edi, %edi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %r12 ; \ + adcxq %rdi, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
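+//
+// Illustrative note (not from the upstream source): both reduction styles
+// rest on 2^256 == 38 and 2^255 == 19 (mod p_25519). In pseudocode, for a
+// 512-bit product v:
+//
+//   l0, h0 = v % 2**256, v >> 256
+//   v1 = l0 + 38 * h0                    # first-stage fold
+//   l1, h1 = v1 % 2**255, v1 >> 255
+//   r  = l1 + 19 * h1                    # sqr_4 result, < 2 * p_25519
+//
+// mul_p25519/sqr_p25519 above go further: they use the quotient estimate
+// h1 + 1, add 19 * (h1 + 1), and subtract 19 again if the carry out of
+// bit 255 shows the estimate was one too large, giving a result < p_25519.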
+ +#define sqr_4(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Add 5-digit inputs and normalize to 4 digits + +#define add5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + addq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + adcq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + adcq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + adcq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + adcq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// 5-digit subtraction with upward bias to make it positive, adding +// 1000 * (2^255 - 19) = 2^256 * 500 - 19000, then normalizing to 4 digits + +#define sub5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + sbbq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + subq $19000, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbx, %r12 ; \ + addq $500, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// It is assumed that 19 * (c * x + y) < 2^60 * 2^256 so we +// don't need a high mul in the final part. 
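+//
+// Illustrative note (not upstream text): in this ladder the constant is
+// always 121666 = 0x1db42 = (A + 2) / 4 for A = 486662, so
+// c * x + y < 2^274 and 19 * floor((c * x + y) / 2^255) fits comfortably
+// in a single word, which is why one imulq suffices in the macro below.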
+ +#define cmadd_4(P0,C1,P2,P3) \ + movq P3, %r8 ; \ + movq 8+P3, %r9 ; \ + movq 16+P3, %r10 ; \ + movq 24+P3, %r11 ; \ + xorl %edi, %edi ; \ + movq $C1, %rdx ; \ + mulxq P2, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 8+P2, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 16+P2, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 24+P2, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %rbx ; \ + adcxq %rdi, %rbx ; \ + shldq $0x1, %r11, %rbx ; \ + btr $63, %r11 ; \ + movl $0x13, %edx ; \ + imulq %rdx, %rbx ; \ + addq %rbx, %r8 ; \ + adcq %rdi, %r9 ; \ + adcq %rdi, %r10 ; \ + adcq %rdi, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplex: z := if NZ then x else y + +#define mux_4(P0,P1,P2) \ + movq P1, %rax ; \ + movq P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq 8+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq 16+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq 24+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 24+P0 + +// Paired multiplex: (w,z) := if NZ then (y,x) else (x,y) + +#define muxpair_4(P0,P1,P2,P3) \ + movq P2, %rax ; \ + movq P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, P0 ; \ + movq %rcx, P1 ; \ + movq 8+P2, %rax ; \ + movq 8+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 8+P0 ; \ + movq %rcx, 8+P1 ; \ + movq 16+P2, %rax ; \ + movq 16+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 16+P0 ; \ + movq %rcx, 16+P1 ; \ + movq 24+P2, %rax ; \ + movq 24+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 24+P0 ; \ + movq %rcx, 24+P1 + +S2N_BN_SYMBOL(curve25519_ladderstep): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdi, rr + movq %rsi, point + movq %rcx, bb + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits. Keep pp in %rdx +// here, after which we can forget about it. 
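+//
+// Illustrative summary of the formulas computed below (not upstream text):
+// with s = (xt + zt)^2, d = (xt - zt)^2 and p = s - d = 4 * xt * zt, the
+// doubling output is (s * d, p * (d + 121666 * p)); with
+// dmsn = (xm - zm) * (xn + zn) and dnsm = (xm + zm) * (xn - zn), the
+// differential addition output is ((dmsn + dnsm)^2, x * (dmsn - dnsm)^2),
+// where x is the affine x-coordinate of the base point.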
+ + sub_4(dm,xm,zm) + add_4(sn,xn,zn) + sub_4(dn,xn,zn) + add_4(sm,xm,zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_5(dmsn,dm,sn) + + movq bb, %rax + testq %rax, %rax + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_5(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub5_4(dpro,dmsn,dnsm) + sqr_4(s,s) + add5_4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: sumx = (dmsn + dnsm)^2 + + sqr_p25519(sumx,spro) + +// DOUBLING: e = 121666 * p + d + + cmadd_4(e,0x1db42,p,d) + +// DOUBLING: dubx = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(dubx,s,d) + +// ADDING: sumz = x * (dmsn - dnsm)^2 + + movq point, %rbp + mul_p25519(sumz,dpro,point_x) + +// DOUBLING: dubz = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(dubz,p,e) + +// Multiplex the outputs + + movq bb, %rax + movq rr, %rbp + testq %rax, %rax + muxpair_4(res0,res2,dubx,sumx) + muxpair_4(res1,res3,dubz,sumz) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep_alt.S new file mode 100644 index 00000000000..5ca8e9997d4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_ladderstep_alt.S @@ -0,0 +1,909 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery ladder step on pairs of (X,Z)-projective curve25519 points +// +// extern void curve25519_ladderstep_alt +// (uint64_t rr[16],uint64_t point[8],uint64_t pp[16],uint64_t b) +// +// If point = (X,1) and pp = (n * (X,1),[n+1] * (X,1)) then the output +// rr = (n' * (X,1),[n'+1] * (X,1)) where n' = 2 * n + b, with input +// b assumed to be 0 or 1; in this setting, each pair (X,Z) is assumed to +// be a projective y-free representation of an affine curve25519 point +// (X/Z,y), with the initial "differential" point having Z = 1 and X its +// affine x coordinate. In other words, the ladderstep operation is a +// combination of doubling, differential addition and optional swapping. +// +// Standard x86-64 ABI: RDI = rr, RSI = point, RDX = pp, RCX = b +// Microsoft x64 ABI: RCX = rr, RDX = point, R8 = pp, R9 = b +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_ladderstep_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_ladderstep_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// The single field of the input point used (z assumed 1) + +#define point_x 0(%rbp) + +// Pointer-offset pairs for pp fields +// These use the initial register %rdx as the offset. 
+// We then never need it again so it can be ephemeral + +#define xn 0(%rdx) +#define zn NUMSIZE(%rdx) +#define xm (2*NUMSIZE)(%rdx) +#define zm (3*NUMSIZE)(%rdx) + +// Result fields + +#define res0 0(%rbp) +#define res1 NUMSIZE(%rbp) +#define res2 (2*NUMSIZE)(%rbp) +#define res3 (3*NUMSIZE)(%rbp) + +// Pointer-offset pairs for temporaries on stack +// dmsn and dnsm need space for >= 5 digits, and we allocate 8 + +#define sm (0*NUMSIZE)(%rsp) +#define sn (1*NUMSIZE)(%rsp) +#define dm (2*NUMSIZE)(%rsp) +#define dn (3*NUMSIZE)(%rsp) +#define dmsn (4*NUMSIZE)(%rsp) +#define dnsm (6*NUMSIZE)(%rsp) +#define s (8*NUMSIZE)(%rsp) +#define d (9*NUMSIZE)(%rsp) +#define p (10*NUMSIZE)(%rsp) + +// Preserved inputs + +#define rr (12*NUMSIZE)(%rsp) +#define point (12*NUMSIZE)+8(%rsp) +#define pp (12*NUMSIZE)+16(%rsp) +#define bb (12*NUMSIZE)+24(%rsp) + +// More, but aliases to above + +#define sumx sm +#define sumz sn +#define dubx dm +#define dubz dn +#define e dubz +#define spro dnsm +#define dpro sumz + +// Total size to reserve on the stack + +#define NSPACE (13*NUMSIZE) + +// Macros wrapping up the basic field operation calls +// bignum_mul_p25519_alt and bignum_sqr_p25519_alt. +// These two are only trivially different from pure +// function calls to those subroutines. + +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + 
movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define sqr_p25519(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplication just giving a 5-digit result (actually < 39 * p_25519) +// by not doing anything beyond the first stage of reduction + +#define mul_5(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, 
%rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
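+//
+// Note (not from the upstream source): this and the other macros in this
+// _alt file use the legacy mulq instruction with plain add/adc carry
+// chains instead of mulx/adcx/adox, presumably for CPUs without BMI2/ADX;
+// the reduction logic is the same as in curve25519_ladderstep.S above.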
+ +#define sqr_4(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Add 5-digit inputs and normalize to 4 digits + +#define add5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + addq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + adcq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + adcq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + adcq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + adcq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// 5-digit subtraction with upward bias to make it positive, adding +// 1000 * (2^255 - 19) = 2^256 * 500 - 19000, then normalizing to 4 digits + +#define sub5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + sbbq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + subq $19000, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbx, %r12 ; \ + addq $500, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// It is assumed that 19 * (c * x + y) < 2^60 * 2^256 so we +// don't need a high mul in the final part. 
+ +#define cmadd_4(P0,C1,P2,P3) \ + movq $C1, %rsi ; \ + movq P2, %rax ; \ + mulq %rsi; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P2, %rax ; \ + xorq %r10, %r10 ; \ + mulq %rsi; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P2, %rax ; \ + mulq %rsi; \ + addq %rax, %r10 ; \ + adcq $0x0, %rdx ; \ + movq 0x18+P2, %rax ; \ + movq %rdx, %r11 ; \ + mulq %rsi; \ + xorl %esi, %esi ; \ + addq %rax, %r11 ; \ + adcq %rsi, %rdx ; \ + addq P3, %r8 ; \ + adcq 0x8+P3, %r9 ; \ + adcq 0x10+P3, %r10 ; \ + adcq 0x18+P3, %r11 ; \ + adcq %rsi, %rdx ; \ + shldq $0x1, %r11, %rdx ; \ + btr $63, %r11 ; \ + movl $0x13, %ebx ; \ + imulq %rbx, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rsi, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplex: z := if NZ then x else y + +#define mux_4(P0,P1,P2) \ + movq P1, %rax ; \ + movq P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq 8+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq 16+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq 24+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 24+P0 + +// Paired multiplex: (w,z) := if NZ then (y,x) else (x,y) + +#define muxpair_4(P0,P1,P2,P3) \ + movq P2, %rax ; \ + movq P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, P0 ; \ + movq %rcx, P1 ; \ + movq 8+P2, %rax ; \ + movq 8+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 8+P0 ; \ + movq %rcx, 8+P1 ; \ + movq 16+P2, %rax ; \ + movq 16+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 16+P0 ; \ + movq %rcx, 16+P1 ; \ + movq 24+P2, %rax ; \ + movq 24+P3, %rcx ; \ + movq %rax, %rdx ; \ + cmovnzq %rcx, %rax ; \ + cmovnzq %rdx, %rcx ; \ + movq %rax, 24+P0 ; \ + movq %rcx, 24+P1 + +S2N_BN_SYMBOL(curve25519_ladderstep_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdi, rr + movq %rsi, point + movq %rcx, bb + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits. Keep pp in %rdx +// here, after which we can forget about it. 
+ + sub_4(dm,xm,zm) + add_4(sn,xn,zn) + sub_4(dn,xn,zn) + add_4(sm,xm,zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_5(dmsn,dm,sn) + + movq bb, %rax + testq %rax, %rax + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_5(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub5_4(dpro,dmsn,dnsm) + sqr_4(s,s) + add5_4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: sumx = (dmsn + dnsm)^2 + + sqr_p25519(sumx,spro) + +// DOUBLING: e = 121666 * p + d + + cmadd_4(e,0x1db42,p,d) + +// DOUBLING: dubx = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(dubx,s,d) + +// ADDING: sumz = x * (dmsn - dnsm)^2 + + movq point, %rbp + mul_p25519(sumz,dpro,point_x) + +// DOUBLING: dubz = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(dubz,p,e) + +// Multiplex the outputs + + movq bb, %rax + movq rr, %rbp + testq %rax, %rax + muxpair_4(res0,res2,dubx,sumx) + muxpair_4(res1,res3,dubz,sumz) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul.S new file mode 100644 index 00000000000..33b41f8ed68 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul.S @@ -0,0 +1,771 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective scalar multiplication, x coordinate only, for curve25519 +// Inputs scalar[4], point[4]; output res[8] +// +// extern void curve25519_pxscalarmul +// (uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]) +// +// Given the X coordinate of an input point = (X,Y) on curve25519, which +// could also be part of a projective representation (X,Y,1) of the same +// point, returns a projective representation (X,Z) = scalar * point, where +// scalar is a 256-bit number. The corresponding affine form is (X/Z,Y'), +// X/Z meaning division modulo 2^255-19, and Y' not being computed by +// this function (nor is any Y coordinate of the input point used). 
+// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_pxscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_pxscalarmul) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence +// and additional registers for loop counter and swap flag + +#define res 10*NUMSIZE(%rsp) +#define point 10*NUMSIZE+8(%rsp) +#define scalar 10*NUMSIZE+16(%rsp) +#define i 10*NUMSIZE+24(%rsp) +#define swap 10*NUMSIZE+32(%rsp) + +// Pointers to input x coord (we don't use y or z) and output coords. +// These all assume the base address (point and res respectively) is +// currently in the %rbp register. + +#define x 0(%rbp) +#define resx 0(%rbp) +#define resz NUMSIZE(%rbp) + +// Pointer-offset pairs for temporaries on stack with some aliasing. +// Both dmsn and dnsm need space for >= 5 digits, and we allocate 8 + +#define zm (0*NUMSIZE)(%rsp) +#define sm (0*NUMSIZE)(%rsp) +#define dpro (0*NUMSIZE)(%rsp) + +#define sn (1*NUMSIZE)(%rsp) + +#define dm (2*NUMSIZE)(%rsp) + +#define zn (3*NUMSIZE)(%rsp) +#define dn (3*NUMSIZE)(%rsp) +#define e (3*NUMSIZE)(%rsp) + +#define dmsn (4*NUMSIZE)(%rsp) +#define p (4*NUMSIZE)(%rsp) + +#define xm (6*NUMSIZE)(%rsp) +#define dnsm (6*NUMSIZE)(%rsp) +#define spro (6*NUMSIZE)(%rsp) + +#define xn (8*NUMSIZE)(%rsp) +#define s (8*NUMSIZE)(%rsp) + +#define d (9*NUMSIZE)(%rsp) + +// Total size to reserve on the stack +// This includes space for the 5 other variables above + +#define NSPACE (10*NUMSIZE+40) + +// Macros wrapping up the basic field operation calls +// bignum_mul_p25519 and bignum_sqr_p25519. +// These two are only trivially different from pure +// function calls to those subroutines. 
+ +#define mul_p25519(P0,P1,P2) \ + xorl %edi, %edi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rdi, %r12 ; \ + xorl %edi, %edi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rdi, %r13 ; \ + adcxq %rdi, %r13 ; \ + xorl %edi, %edi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rdi, %r14 ; \ + adcxq %rdi, %r14 ; \ + xorl %edi, %edi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rdi, %r15 ; \ + adcxq %rdi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %edi, %edi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %r12 ; \ + adcxq %rdi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rdi, %r10 ; \ + adcq %rdi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rdi, %r9 ; \ + sbbq %rdi, %r10 ; \ + sbbq %rdi, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define sqr_p25519(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, 
%rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + leaq 0x1(%r12), %rax ; \ + bts $0x3f, %r11 ; \ + imulq %rdx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rdx ; \ + subq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + btr $0x3f, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplication just giving a 5-digit result (actually < 39 * p_25519) +// by not doing anything beyond the first stage of reduction + +#define mul_5(P0,P1,P2) \ + xorl %edi, %edi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rdi, %r12 ; \ + xorl %edi, %edi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rdi, %r13 ; \ + adcxq %rdi, %r13 ; \ + xorl %edi, %edi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rdi, %r14 ; \ + adcxq %rdi, %r14 ; \ + xorl %edi, %edi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rdi, %r15 ; \ + adcxq %rdi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %edi, %edi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %r12 ; \ + adcxq %rdi, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
+ +#define sqr_4(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Add 5-digit inputs and normalize to 4 digits + +#define add5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + addq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + adcq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + adcq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + adcq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + adcq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// 5-digit subtraction with upward bias to make it positive, adding +// 1000 * (2^255 - 19) = 2^256 * 500 - 19000, then normalizing to 4 digits + +#define sub5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + sbbq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + subq $19000, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbx, %r12 ; \ + addq $500, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// It is assumed that 19 * (c * x + y) < 2^60 * 2^256 so we +// don't need a high mul in the final part. + +#define cmadd_4(P0,C1,P2,P3) \ + movq P3, %r8 ; \ + movq 8+P3, %r9 ; \ + movq 16+P3, %r10 ; \ + movq 24+P3, %r11 ; \ + xorl %edi, %edi ; \ + movq $C1, %rdx ; \ + mulxq P2, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq 8+P2, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 16+P2, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 24+P2, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rdi, %rbx ; \ + adcxq %rdi, %rbx ; \ + shldq $0x1, %r11, %rbx ; \ + btr $63, %r11 ; \ + movl $0x13, %edx ; \ + imulq %rdx, %rbx ; \ + addq %rbx, %r8 ; \ + adcq %rdi, %r9 ; \ + adcq %rdi, %r10 ; \ + adcq %rdi, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplex: z := if NZ then x else y + +#define mux_4(P0,P1,P2) \ + movq P1, %rax ; \ + movq P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq 8+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq 16+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq 24+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 24+P0 + +S2N_BN_SYMBOL(curve25519_pxscalarmul): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. 
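+// For reference: up to the bit-driven multiplexing, each iteration of the
+// main loop below computes one combined differential addition and doubling
+// of the Montgomery ladder, roughly in the notation of RFC 7748, with
+// (x2,z2) the point selected for doubling and (x3,z3) the other one:
+//
+//     A  = x2 + z2            B  = x2 - z2
+//     AA = A^2                BB = B^2
+//     E  = AA - BB
+//     C  = x3 + z3            D  = x3 - z3
+//     DA = D * A              CB = C * B
+//     x3' = (DA + CB)^2       z3' = x1 * (DA - CB)^2
+//     x2' = AA * BB           z2' = E * (BB + 121666 * E)
+//
+// where z2' equals the RFC 7748 form E * (AA + 121665 * E), since AA = BB + E.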
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $NSPACE, %rsp + +// Move the input arguments to stable places + + movq %rdi, res + movq %rsi, scalar + movq %rdx, point + +// Initialize (xn,zn) = (1,0) and (xm,zm) = (x,1) with swap = 0 + + movq $1, %rax + movq %rax, 256(%rsp) + movq %rax, (%rsp) + xorl %eax, %eax + movq %rax, swap + movq %rax, 96(%rsp) + movq %rax, 264(%rsp) + movq %rax, 8(%rsp) + movq %rax, 104(%rsp) + movq %rax, 272(%rsp) + movq %rax, 16(%rsp) + movq %rax, 112(%rsp) + movq %rax, 280(%rsp) + movq %rax, 24(%rsp) + movq %rax, 120(%rsp) + movq (%rdx), %rax + movq %rax, 192(%rsp) + movq 8(%rdx), %rax + movq %rax, 200(%rsp) + movq 16(%rdx), %rax + movq %rax, 208(%rsp) + movq 24(%rdx), %rax + movq %rax, 216(%rsp) + +// The outer loop from i = 255, ..., i = 0 (inclusive) + + movl $255, %eax + movq %rax, i + +curve25519_pxscalarmul_loop: + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits. + + sub_4(dm,xm,zm) + add_4(sn,xn,zn) + sub_4(dn,xn,zn) + add_4(sm,xm,zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_5(dmsn,sn,dm) + + movq scalar, %rax + movq i, %rdx + movq %rdx, %rcx + shrq $6, %rdx + movq (%rax,%rdx,8), %rdx + shrq %cl, %rdx + andq $1, %rdx + cmpq swap, %rdx + movq %rdx, swap + + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_5(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub5_4(dpro,dmsn,dnsm) + sqr_4(s,s) + add5_4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: xm' = (dmsn + dnsm)^2 + + sqr_p25519(xm,spro) + +// DOUBLING: e = 121666 * p + d + + cmadd_4(e,0x1db42,p,d) + +// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(xn,s,d) + +// ADDING: zm' = x * (dmsn - dnsm)^2 + + movq point, %rbp + mul_p25519(zm,dpro,x) + +// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(zn,p,e) + +// Loop down as far as 0 (inclusive) + + movq i, %rax + subq $1, %rax + movq %rax, i + jnc curve25519_pxscalarmul_loop + +// The main loop does not handle the special input of the 2-torsion +// point = (0,0). In that case we may get a spurious (0,0) as output +// when we want (0,1) [for odd scalar] or (1,0) [for even scalar]. +// Test if x = 0 (this is equivalent for curve25519 to y = 0) and if +// so, patch zm = 1 [for odd multiple], xn = 1 [for even multiple]. 
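+// In C terms the patch below is roughly (x being the input x-coordinate,
+// zm and xn the stack temporaries holding those field elements):
+//
+//     uint64_t x_is_zero = ((x[0] | x[1] | x[2] | x[3]) == 0);
+//     zm[0] |= x_is_zero;   // (xm,zm) becomes (0,1) for an odd multiple
+//     xn[0] |= x_is_zero;   // (xn,zn) becomes (1,0) for an even multiple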
+ + movl $1, %ecx + movq point, %rbp + movq (%rbp), %rax + orq 8(%rbp), %rax + orq 16(%rbp), %rax + orq 24(%rbp), %rax + cmovnzq %rcx, %rax + xorq $1, %rax + orq %rax, (%rsp) + orq %rax, 256(%rsp) + +// Multiplex into the final outputs + + movq res, %rbp + movq swap, %rax + testq %rax, %rax + + mux_4(resx,xm,xn) + mux_4(resz,zm,zn) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul_alt.S new file mode 100644 index 00000000000..65f896ddd81 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_pxscalarmul_alt.S @@ -0,0 +1,937 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective scalar multiplication, x coordinate only, for curve25519 +// Inputs scalar[4], point[4]; output res[8] +// +// extern void curve25519_pxscalarmul_alt +// (uint64_t res[static 8],uint64_t scalar[static 4],uint64_t point[static 4]) +// +// Given the X coordinate of an input point = (X,Y) on curve25519, which +// could also be part of a projective representation (X,Y,1) of the same +// point, returns a projective representation (X,Z) = scalar * point, where +// scalar is a 256-bit number. The corresponding affine form is (X/Z,Y'), +// X/Z meaning division modulo 2^255-19, and Y' not being computed by +// this function (nor is any Y coordinate of the input point used). +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(curve25519_pxscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(curve25519_pxscalarmul_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Stable homes for input arguments during main code sequence +// and additional registers for loop counter and swap flag + +#define res 10*NUMSIZE(%rsp) +#define point 10*NUMSIZE+8(%rsp) +#define scalar 10*NUMSIZE+16(%rsp) +#define i 10*NUMSIZE+24(%rsp) +#define swap 10*NUMSIZE+32(%rsp) + +// Pointers to input x coord (we don't use y or z) and output coords. +// These all assume the base address (point and res respectively) is +// currently in the %rbp register. + +#define x 0(%rbp) +#define resx 0(%rbp) +#define resz NUMSIZE(%rbp) + +// Pointer-offset pairs for temporaries on stack with some aliasing. 
+// Both dmsn and dnsm need space for >= 5 digits, and we allocate 8 + +#define zm (0*NUMSIZE)(%rsp) +#define sm (0*NUMSIZE)(%rsp) +#define dpro (0*NUMSIZE)(%rsp) + +#define sn (1*NUMSIZE)(%rsp) + +#define dm (2*NUMSIZE)(%rsp) + +#define zn (3*NUMSIZE)(%rsp) +#define dn (3*NUMSIZE)(%rsp) +#define e (3*NUMSIZE)(%rsp) + +#define dmsn (4*NUMSIZE)(%rsp) +#define p (4*NUMSIZE)(%rsp) + +#define xm (6*NUMSIZE)(%rsp) +#define dnsm (6*NUMSIZE)(%rsp) +#define spro (6*NUMSIZE)(%rsp) + +#define xn (8*NUMSIZE)(%rsp) +#define s (8*NUMSIZE)(%rsp) + +#define d (9*NUMSIZE)(%rsp) + +// Total size to reserve on the stack +// This includes space for the 5 other variables above + +#define NSPACE (10*NUMSIZE+40) + +// Macros wrapping up the basic field operation calls +// bignum_mul_p25519_alt and bignum_sqr_p25519_alt. +// These two are only trivially different from pure +// function calls to those subroutines. + +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq 
%rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define sqr_p25519(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplication just giving a 5-digit result (actually < 39 * p_25519) +// by not doing anything beyond the first stage of reduction + +#define mul_5(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; 
\ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
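+// For reference, the quotient estimate that sqr_4 below skips (and that
+// mul_p25519/sqr_p25519 above use) is q = (h >> 255) + 1 applied to the
+// folded 5-limb value h, followed by one flag-driven correction.  An
+// unoptimized C reference for the same fully reduced result, using a
+// hypothetical helper name and assuming limbs t[0..4] with t[4] small, is:
+//
+//     // hypothetical reference, not the routine used by the macros above
+//     static void ref_mod_p25519(uint64_t r[4], const uint64_t t[5]) {
+//       // fold bits >= 2^255 back in, using 2^255 == 19 (mod p_25519)
+//       unsigned __int128 c = (unsigned __int128)19 * ((t[4] << 1) | (t[3] >> 63));
+//       uint64_t s[4] = { t[0], t[1], t[2], t[3] & ~(1ULL << 63) };
+//       for (int i = 0; i < 4; i++) { c += s[i]; s[i] = (uint64_t)c; c >>= 64; }
+//       // at most one conditional subtraction of p is now needed
+//       const uint64_t p[4] = { 0xffffffffffffffedULL, ~0ULL, ~0ULL, 0x7fffffffffffffffULL };
+//       uint64_t d[4], borrow = 0;
+//       for (int i = 0; i < 4; i++) {
+//         unsigned __int128 diff = (unsigned __int128)s[i] - p[i] - borrow;
+//         d[i] = (uint64_t)diff; borrow = (uint64_t)(diff >> 64) & 1;
+//       }
+//       for (int i = 0; i < 4; i++) r[i] = borrow ? s[i] : d[i];  // keep s if s < p
+//     }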
+ +#define sqr_4(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Add 5-digit inputs and normalize to 4 digits + +#define add5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + addq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + adcq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + adcq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + adcq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + adcq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// 5-digit subtraction with upward bias to make it positive, adding +// 1000 * (2^255 - 19) = 2^256 * 500 - 19000, then normalizing to 4 digits + +#define sub5_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq 32+P1, %r12 ; \ + sbbq 32+P2, %r12 ; \ + xorl %ebx, %ebx ; \ + subq $19000, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rbx, %r12 ; \ + addq $500, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Combined z = c * x + y with reduction only < 2 * p_25519 +// It is assumed that 19 * (c * x + y) < 2^60 * 2^256 so we +// don't need a high mul in the final part. + +#define cmadd_4(P0,C1,P2,P3) \ + movq $C1, %rsi ; \ + movq P2, %rax ; \ + mulq %rsi; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P2, %rax ; \ + xorq %r10, %r10 ; \ + mulq %rsi; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P2, %rax ; \ + mulq %rsi; \ + addq %rax, %r10 ; \ + adcq $0x0, %rdx ; \ + movq 0x18+P2, %rax ; \ + movq %rdx, %r11 ; \ + mulq %rsi; \ + xorl %esi, %esi ; \ + addq %rax, %r11 ; \ + adcq %rsi, %rdx ; \ + addq P3, %r8 ; \ + adcq 0x8+P3, %r9 ; \ + adcq 0x10+P3, %r10 ; \ + adcq 0x18+P3, %r11 ; \ + adcq %rsi, %rdx ; \ + shldq $0x1, %r11, %rdx ; \ + btr $63, %r11 ; \ + movl $0x13, %ebx ; \ + imulq %rbx, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rsi, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Multiplex: z := if NZ then x else y + +#define mux_4(P0,P1,P2) \ + movq P1, %rax ; \ + movq P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + movq 8+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + movq 16+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + movq 24+P2, %rcx ; \ + cmovzq %rcx, %rax ; \ + movq %rax, 24+P0 + +S2N_BN_SYMBOL(curve25519_pxscalarmul_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. 
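+// Two notes on constants used in the macros above:
+//
+//   * sub5_4 adds the bias 1000 * p_25519 = 1000 * (2^255 - 19)
+//                                         = 500 * 2^256 - 19000,
+//     hence the "subq $19000" on the bottom limb and "addq $500" on the
+//     fifth limb; the 5-digit quantities being subtracted are far below
+//     1000 * p_25519, so the biased difference is always nonnegative.
+//
+//   * cmadd_4 is only used as e = 121666 * p + d with 4-digit p and d, so
+//     19 * (121666 * p + d) is far below the 2^60 * 2^256 bound assumed in
+//     its comment, and the single imulq by 19 cannot overflow one limb.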
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $NSPACE, %rsp + +// Move the input arguments to stable places + + movq %rdi, res + movq %rsi, scalar + movq %rdx, point + +// Initialize (xn,zn) = (1,0) and (xm,zm) = (x,1) with swap = 0 + + movq $1, %rax + movq %rax, 256(%rsp) + movq %rax, (%rsp) + xorl %eax, %eax + movq %rax, swap + movq %rax, 96(%rsp) + movq %rax, 264(%rsp) + movq %rax, 8(%rsp) + movq %rax, 104(%rsp) + movq %rax, 272(%rsp) + movq %rax, 16(%rsp) + movq %rax, 112(%rsp) + movq %rax, 280(%rsp) + movq %rax, 24(%rsp) + movq %rax, 120(%rsp) + movq (%rdx), %rax + movq %rax, 192(%rsp) + movq 8(%rdx), %rax + movq %rax, 200(%rsp) + movq 16(%rdx), %rax + movq %rax, 208(%rsp) + movq 24(%rdx), %rax + movq %rax, 216(%rsp) + +// The outer loop from i = 255, ..., i = 0 (inclusive) + + movl $255, %eax + movq %rax, i + +curve25519_pxscalarmul_alt_loop: + +// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn +// The adds don't need any normalization as they're fed to muls +// Just make sure the subs fit in 4 digits. + + sub_4(dm,xm,zm) + add_4(sn,xn,zn) + sub_4(dn,xn,zn) + add_4(sm,xm,zm) + +// ADDING: dmsn = dm * sn; dnsm = sm * dn +// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + + mul_5(dmsn,sn,dm) + + movq scalar, %rax + movq i, %rdx + movq %rdx, %rcx + shrq $6, %rdx + movq (%rax,%rdx,8), %rdx + shrq %cl, %rdx + andq $1, %rdx + cmpq swap, %rdx + movq %rdx, swap + + mux_4(d,dm,dn) + mux_4(s,sm,sn) + + mul_5(dnsm,sm,dn) + +// DOUBLING: d = (xt - zt)^2 normalized only to 4 digits + + sqr_4(d,d) + +// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 +// DOUBLING: s = (xt + zt)^2, normalized only to 4 digits + + sub5_4(dpro,dmsn,dnsm) + sqr_4(s,s) + add5_4(spro,dmsn,dnsm) + sqr_4(dpro,dpro) + +// DOUBLING: p = 4 * xt * zt = s - d + + sub_twice4(p,s,d) + +// ADDING: xm' = (dmsn + dnsm)^2 + + sqr_p25519(xm,spro) + +// DOUBLING: e = 121666 * p + d + + cmadd_4(e,0x1db42,p,d) + +// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d + + mul_p25519(xn,s,d) + +// ADDING: zm' = x * (dmsn - dnsm)^2 + + movq point, %rbp + mul_p25519(zm,dpro,x) + +// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) +// = p * (d + 121666 * p) + + mul_p25519(zn,p,e) + +// Loop down as far as 0 (inclusive) + + movq i, %rax + subq $1, %rax + movq %rax, i + jnc curve25519_pxscalarmul_alt_loop + +// The main loop does not handle the special input of the 2-torsion +// point = (0,0). In that case we may get a spurious (0,0) as output +// when we want (0,1) [for odd scalar] or (1,0) [for even scalar]. +// Test if x = 0 (this is equivalent for curve25519 to y = 0) and if +// so, patch zm = 1 [for odd multiple], xn = 1 [for even multiple]. 
+ + movl $1, %ecx + movq point, %rbp + movq (%rbp), %rax + orq 8(%rbp), %rax + orq 16(%rbp), %rax + orq 24(%rbp), %rax + cmovnzq %rcx, %rax + xorq $1, %rax + orq %rax, (%rsp) + orq %rax, 256(%rsp) + +// Multiplex into the final outputs + + movq res, %rbp + movq swap, %rax + testq %rax, %rax + + mux_4(resx,xm,xn) + mux_4(resz,zm,zn) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519.S index b9f7cdaa163..db8d3767374 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519.S @@ -595,6 +595,7 @@ S2N_BN_SYMBOL(curve25519_x25519): S2N_BN_SYMBOL(curve25519_x25519_byte): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519_alt.S index f7c6c3d7b02..88c29f1ec0a 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519_alt.S @@ -756,6 +756,7 @@ S2N_BN_SYMBOL(curve25519_x25519_alt): S2N_BN_SYMBOL(curve25519_x25519_byte_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base.S index dda3b1707b6..eb7e509aa61 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base.S @@ -338,6 +338,7 @@ S2N_BN_SYMBOL(curve25519_x25519base): S2N_BN_SYMBOL(curve25519_x25519base_byte): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. 
diff --git a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base_alt.S index b6c82faba0c..34ee779a183 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/curve25519_x25519base_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/curve25519_x25519base_alt.S @@ -414,6 +414,7 @@ S2N_BN_SYMBOL(curve25519_x25519base_alt): S2N_BN_SYMBOL(curve25519_x25519base_byte_alt): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode.S index ae63e0dacba..25cc51bc3ac 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode.S @@ -69,6 +69,7 @@ #define Q8 (25*N) S2N_BN_SYMBOL(edwards25519_decode): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from subroutine offsets diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode_alt.S index 8bfe721253a..bd5fae468d0 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_decode_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_decode_alt.S @@ -69,6 +69,7 @@ #define Q8 (25*N) S2N_BN_SYMBOL(edwards25519_decode_alt): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from subroutine offsets diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_encode.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_encode.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_encode.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_encode.S index 13b0102d098..dc05eb2d45d 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_encode.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_encode.S @@ -41,6 +41,7 @@ #define xb %r9 S2N_BN_SYMBOL(edwards25519_encode): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd.S new file mode 100644 index 00000000000..02b0504aaad --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd.S @@ -0,0 +1,436 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective addition for edwards25519 +// Inputs p1[16], p2[16]; output p3[16] +// +// extern void edwards25519_epadd +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]) +// +// The output p3 and both inputs p1 and p2 are points (x,y) on +// edwards25519 represented in extended projective quadruples (X,Y,Z,T) +// where x = X / Z, y = Y / Z and x * y = T / Z. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway; +// the p2 = %rbp assignment is set up at the beginning. + +#define p3 %rdi +#define p1 %rsi +#define p2 %rbp + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) +#define w_1 (3*NUMSIZE)(p1) + +#define x_2 0(p2) +#define y_2 NUMSIZE(p2) +#define z_2 (2*NUMSIZE)(p2) +#define w_2 (3*NUMSIZE)(p2) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) +#define t5 (5*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519. 
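+// (For reference: carrying the extra coordinate T = X*Y/Z is what keeps the
+// unified addition cheap.  The Edwards addition law needs the product
+// d * x1*x2*y1*y2, and since x1*y1 = T1/Z1 and x2*y2 = T2/Z2 that product is
+// just d * T1*T2 over the common projective denominator Z1*Z2, i.e. a single
+// field multiplication by the precomputed constant k_25519 = 2 * d_25519
+// loaded further below.)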
+ +#define mul_p25519(P0,P1,P2) \ + xorl %esi, %esi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rsi, %r12 ; \ + xorl %esi, %esi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rsi, %r13 ; \ + adcxq %rsi, %r13 ; \ + xorl %esi, %esi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rsi, %r14 ; \ + adcxq %rsi, %r14 ; \ + xorl %esi, %esi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rsi, %r15 ; \ + adcxq %rsi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %esi, %esi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rsi, %r12 ; \ + adcxq %rsi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rsi, %r9 ; \ + sbbq %rsi, %r10 ; \ + sbbq %rsi, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
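+// For orientation, the macros in this file are combined at the end into the
+// standard unified addition on extended twisted-Edwards coordinates (the
+// Hisil-Wong-Carter-Dawson formulas); with P1 = (X1,Y1,Z1,T1) and
+// P2 = (X2,Y2,Z2,T2) the sequence computes
+//
+//     A = (Y1 - X1) * (Y2 - X2)       B = (Y1 + X1) * (Y2 + X2)
+//     C = (2*d) * T1 * T2             D = 2 * Z1 * Z2
+//     E = B - A     F = D - C     G = D + C     H = B + A
+//     X3 = E * F    Y3 = G * H    T3 = E * H    Z3 = F * G
+//
+// The intermediate products only need the weaker < 2 * p_25519 bound; the
+// four final mul_p25519 calls then produce fully reduced output coordinates.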
+ +#define mul_4(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +#define double_4(P0,P1) \ + movq P1, %rax ; \ + addq %rax, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 24+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movq $0xebd69b9426b2f159, %rax ; \ + movq %rax, P0 ; \ + movq $0x00e0149a8283b156, %rax ; \ + movq %rax, 8+P0 ; \ + movq $0x198e80f2eef3d130, %rax ; \ + movq %rax, 16+P0 ; \ + movq $0x2406d9dc56dffce7, %rax ; \ + movq %rax, 24+P0 + +S2N_BN_SYMBOL(edwards25519_epadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations. after setting up p2 in its register + + movq %rdx, p2 + + mul_4(t0,w_1,w_2) + + sub_4(t1,y_1,x_1) + sub_4(t2,y_2,x_2) + add_4(t3,y_1,x_1) + add_4(t4,y_2,x_2) + double_4(t5,z_2) + + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + + load_k25519(t2) + mul_4(t2,t2,t0) + + mul_4(t4,z_1,t5) + + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + + mul_p25519(w_3,t0,t5) + mul_p25519(x_3,t0,t1) + mul_p25519(y_3,t3,t5) + mul_p25519(z_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd_alt.S new file mode 100644 index 00000000000..3da55cafb62 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epadd_alt.S @@ -0,0 +1,512 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective addition for edwards25519 +// Inputs p1[16], p2[16]; output p3[16] +// +// extern void edwards25519_epadd_alt +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 16]) +// +// The output p3 and both inputs p1 and p2 are points (x,y) on +// edwards25519 represented in extended projective quadruples (X,Y,Z,T) +// where x = X / Z, y = Y / Z and x * y = T / Z. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway; +// the p2 = %rbp assignment is set up at the beginning. + +#define p3 %rdi +#define p1 %rsi +#define p2 %rbp + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) +#define w_1 (3*NUMSIZE)(p1) + +#define x_2 0(p2) +#define y_2 NUMSIZE(p2) +#define z_2 (2*NUMSIZE)(p2) +#define w_2 (3*NUMSIZE)(p2) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) +#define t5 (5*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519_alt. 
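+// (This _alt variant mirrors edwards25519_epadd.S but builds the field
+// arithmetic from widening mulq and plain adc carry chains only, avoiding
+// the BMI2 mulx and ADX adcx/adox instructions the non-alt version relies
+// on; which of the two gets called is expected to be decided by the caller
+// from the CPU's capabilities.)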
+ +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
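+// (Why the weaker bound is acceptable here: a value < 2 * p_25519 still fits
+// in four limbs, sub_twice4/add_twice4 below work modulo the double modulus
+// 2 * p_25519 = 2^256 - 38, and every coordinate actually written to the
+// output goes through a final, fully reducing mul_p25519.)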
+ +#define mul_4(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %ebx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +#define double_4(P0,P1) \ + movq P1, %rax ; \ + addq %rax, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 24+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Load the constant k_25519 = 2 * d_25519 using immediate operations + +#define load_k25519(P0) \ + movq $0xebd69b9426b2f159, %rax ; \ + movq %rax, P0 ; \ + movq $0x00e0149a8283b156, %rax ; \ + movq %rax, 8+P0 ; \ + movq $0x198e80f2eef3d130, %rax ; \ + movq %rax, 16+P0 ; \ + movq $0x2406d9dc56dffce7, %rax ; \ + movq %rax, 24+P0 + +S2N_BN_SYMBOL(edwards25519_epadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations. after setting up p2 in its register + + movq %rdx, p2 + + mul_4(t0,w_1,w_2) + + sub_4(t1,y_1,x_1) + sub_4(t2,y_2,x_2) + add_4(t3,y_1,x_1) + add_4(t4,y_2,x_2) + double_4(t5,z_2) + + mul_4(t1,t1,t2) + mul_4(t3,t3,t4) + + load_k25519(t2) + mul_4(t2,t2,t0) + + mul_4(t4,z_1,t5) + + sub_twice4(t0,t3,t1) + add_twice4(t5,t3,t1) + sub_twice4(t1,t4,t2) + add_twice4(t3,t4,t2) + + mul_p25519(w_3,t0,t5) + mul_p25519(x_3,t0,t1) + mul_p25519(y_3,t3,t5) + mul_p25519(z_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble.S new file mode 100644 index 00000000000..4472d99a19f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble.S @@ -0,0 +1,375 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective doubling for edwards25519 +// Input p1[12]; output p3[16] +// +// extern void edwards25519_epdouble +// (uint64_t p3[static 16],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// The output p3 is in extended projective coordinates, representing +// affine (x,y) by a quadruple (X,Y,Z,T) where x = X / Z, y = Y / Z +// and x * y = T / Z. The input p1 may also be in the same extended +// projective representation, but the final T field is not used so +// a more basic projective triple (X,Y,Z) suffices. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epdouble) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway. + +#define p3 %rdi +#define p1 %rsi + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519. 
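For reference, the reduction inside the mul_p25519 macro below can be read against the two identities that follow from $p_{25519} = 2^{255} - 19$ (a sketch traced from the constants 0x26 = 38 and 0x13 = 19 used in the code, not a comment taken from the source):
\[
2^{256} \equiv 38 \pmod{p_{25519}}, \qquad 2^{255} \equiv 19 \pmod{p_{25519}}.
\]
A 512-bit product written as $2^{256} h + \ell$ is first folded to $\ell + 38h$, and whatever part of that sum lies above bit 255 is folded once more via a multiplication by 19, followed by the final conditional correction that brings the result into $[0, p_{25519})$.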
+ +#define mul_p25519(P0,P1,P2) \ + xorl %esi, %esi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rsi, %r12 ; \ + xorl %esi, %esi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rsi, %r13 ; \ + adcxq %rsi, %r13 ; \ + xorl %esi, %esi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rsi, %r14 ; \ + adcxq %rsi, %r14 ; \ + xorl %esi, %esi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rsi, %r15 ; \ + adcxq %rsi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %esi, %esi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rsi, %r12 ; \ + adcxq %rsi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rsi, %r9 ; \ + sbbq %rsi, %r10 ; \ + sbbq %rsi, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
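As a rough justification of that claim (a bound sketched here, not stated in the source): after the 38-fold the intermediate value satisfies $t < 39 \cdot 2^{256}$, so the quotient $q = \lfloor t / 2^{255} \rfloor$ is at most 77 and
\[
(t \bmod 2^{255}) + 19\,q \;<\; 2^{255} + 19 \cdot 77 \;<\; 2^{256} - 38 \;=\; 2\,p_{25519},
\]
which is why omitting the +1 and the final correction still leaves a 4-digit value below the double modulus.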
+ +#define sqr_4(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
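Concretely, the double modulus is $2\,p_{25519} = 2^{256} - 38$, so arithmetic modulo it only ever needs a single adjustment by 38 (spelled out as a reading aid, not a statement from the source):
\[
2^{256} \equiv 38 \pmod{2\,p_{25519}}.
\]
Hence add_twice4 adds 38 back whenever the raw 4-digit sum carries out of bit 256, and sub_twice4 subtracts 38 whenever the raw difference borrows; the cmovncq instructions select between 38 and 0 accordingly.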
+ +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define double_twice4(P0,P1) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq %r8, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq %r9, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq %r10, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq %r11, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_epdouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(w_3,t1,t4) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble_alt.S new file mode 100644 index 00000000000..a335149e0e6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_epdouble_alt.S @@ -0,0 +1,454 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective doubling for edwards25519 +// Input p1[12]; output p3[16] +// +// extern void edwards25519_epdouble +// (uint64_t p3[static 16],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// The output p3 is in extended projective coordinates, representing +// affine (x,y) by a quadruple (X,Y,Z,T) where x = X / Z, y = Y / Z +// and x * y = T / Z. The input p1 may also be in the same extended +// projective representation, but the final T field is not used so +// a more basic projective triple (X,Y,Z) suffices. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_epdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_epdouble_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway. 
+ +#define p3 %rdi +#define p1 %rsi + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519_alt. + +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the 
quotient estimate and the final +// optional correction. + +#define sqr_4(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %ebx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. 
+// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define double_twice4(P0,P1) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq %r8, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq %r9, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq %r10, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq %r11, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_epdouble_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(w_3,t1,t4) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble.S new file mode 100644 index 00000000000..093d790289b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble.S @@ -0,0 +1,370 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective doubling for edwards25519 +// Input p1[12]; output p3[12] +// +// extern void edwards25519_pdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// Input and output are in pure projective coordinates, representing +// an affine (x,y) by a triple (X,Y,Z) where x = X / Z, y = Y / Z. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pdouble) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway. 
+ +#define p3 %rdi +#define p1 %rsi + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519. + +#define mul_p25519(P0,P1,P2) \ + xorl %esi, %esi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rsi, %r12 ; \ + xorl %esi, %esi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rsi, %r13 ; \ + adcxq %rsi, %r13 ; \ + xorl %esi, %esi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rsi, %r14 ; \ + adcxq %rsi, %r14 ; \ + xorl %esi, %esi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rsi, %r15 ; \ + adcxq %rsi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %esi, %esi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rsi, %r12 ; \ + adcxq %rsi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rsi, %r9 ; \ + sbbq %rsi, %r10 ; \ + sbbq %rsi, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final +// optional correction. 
+ +#define sqr_4(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. 
+ +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define double_twice4(P0,P1) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq %r8, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq %r9, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq %r10, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq %r11, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_pdouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble_alt.S new file mode 100644 index 00000000000..4d122cd6284 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pdouble_alt.S @@ -0,0 +1,449 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Projective doubling for edwards25519 +// Input p1[12]; output p3[12] +// +// extern void edwards25519_pdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]) +// +// If p1 is a point on edwards25519, returns its double p3 = 2 * p1. +// Input and output are in pure projective coordinates, representing +// an affine (x,y) by a triple (X,Y,Z) where x = X / Z, y = Y / Z. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pdouble_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway. 
+ +#define p3 %rdi +#define p1 %rsi + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (5*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519_alt. + +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Squaring just giving a result < 2 * p_25519, which is done by +// basically skipping the +1 in the quotient estimate and the final 
+// optional correction. + +#define sqr_4(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %ebx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition and doubling with double modulus 2 * p_25519 = 2^256 - 38. +// This only ensures that the result fits in 4 digits, not that it is reduced +// even w.r.t. double modulus. 
The result is always correct modulo provided +// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided +// at least one of them is reduced double modulo. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +#define double_twice4(P0,P1) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq %r8, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq %r9, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq %r10, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq %r11, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_pdouble_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence + + add_4(t0,x_1,y_1) + sqr_4(t1,z_1) + sqr_4(t2,x_1) + sqr_4(t3,y_1) + double_twice4(t1,t1) + sqr_4(t0,t0) + add_twice4(t4,t2,t3) + sub_twice4(t2,t2,t3) + add_twice4(t3,t1,t2) + sub_twice4(t1,t4,t0) + mul_p25519(y_3,t2,t4) + mul_p25519(z_3,t3,t2) + mul_p25519(x_3,t1,t3) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd.S new file mode 100644 index 00000000000..1d68e2add39 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd.S @@ -0,0 +1,419 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective + precomputed mixed addition for edwards25519 +// Inputs p1[16], p2[12]; output p3[16] +// +// extern void edwards25519_pepadd +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]) +// +// The output p3 and the first input p1 are points (x,y) on edwards25519 +// represented in extended projective quadruples (X,Y,Z,T) where +// x = X / Z, y = Y / Z and x * y = T / Z. The second input p2 is a triple +// encoding its point (x,y) as (y - x,y + x,2 * d * x * y) where d is the +// usual Edwards curve parameter for edwards25519. 
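Spelling out the algebra behind that representation (a sketch traced from the main sequence near the end of this file, not a comment from the source): writing the precomputed second point as $(y_2 - x_2,\; y_2 + x_2,\; 2d\,x_2 y_2)$ and setting $A = (Y_1 - X_1)(y_2 - x_2)$, $B = (Y_1 + X_1)(y_2 + x_2)$, $C = T_1 \cdot 2d\,x_2 y_2$ and $D = 2 Z_1$, the code forms
\[
E = B - A, \qquad F = D - C, \qquad G = D + C, \qquad H = B + A,
\]
\[
X_3 = E\,F, \qquad Y_3 = G\,H, \qquad Z_3 = F\,G, \qquad T_3 = E\,H,
\]
the standard extended-coordinate mixed-addition formulas of Hisil, Wong, Carter and Dawson for the $a = -1$ twisted Edwards form, with the second point taken to have $Z_2 = 1$.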
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pepadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pepadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway; +// the p2 = %rbp assignment is set up at the beginning. + +#define p3 %rdi +#define p1 %rsi +#define p2 %rbp + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) +#define w_1 (3*NUMSIZE)(p1) + +#define ymx_2 0(p2) +#define xpy_2 NUMSIZE(p2) +#define kxy_2 (2*NUMSIZE)(p2) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) +#define t5 (5*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519. + +#define mul_p25519(P0,P1,P2) \ + xorl %esi, %esi ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rsi, %r12 ; \ + xorl %esi, %esi ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rsi, %r13 ; \ + adcxq %rsi, %r13 ; \ + xorl %esi, %esi ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rsi, %r14 ; \ + adcxq %rsi, %r14 ; \ + xorl %esi, %esi ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rsi, %r15 ; \ + adcxq %rsi, %r15 ; \ + movl $0x26, %edx ; \ + xorl %esi, %esi ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rsi, %r12 ; \ + adcxq %rsi, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + movl $0x13, %edx ; \ + incq %r12; \ + bts $63, %r11 ; \ + mulxq %r12, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rsi, %r10 ; \ + adcq %rsi, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rsi, %r9 ; \ + sbbq %rsi, %r10 ; \ + sbbq %rsi, %r11 ; \ + btr $63, 
%r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. + +#define mul_4(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movl $0x26, %edx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +#define double_4(P0,P1) \ + movq P1, %rax ; \ + addq %rax, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 24+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_pepadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations. after setting up p2 in its register + + movq %rdx, p2 + + double_4(t0,z_1); + + sub_4(t1,y_1,x_1); + add_4(t2,y_1,x_1); + + mul_4(t3,w_1,kxy_2); + + mul_4(t1,t1,ymx_2); + mul_4(t2,t2,xpy_2); + + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + + mul_p25519(z_3,t4,t0); + mul_p25519(x_3,t5,t4); + mul_p25519(y_3,t0,t1); + mul_p25519(w_3,t5,t1); + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd_alt.S new file mode 100644 index 00000000000..46faa373be1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_pepadd_alt.S @@ -0,0 +1,495 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended projective + precomputed mixed addition for edwards25519 +// Inputs p1[16], p2[12]; output p3[16] +// +// extern void edwards25519_pepadd_alt +// (uint64_t p3[static 16],uint64_t p1[static 16],uint64_t p2[static 12]) +// +// The output p3 and the first input p1 are points (x,y) on edwards25519 +// represented in extended projective quadruples (X,Y,Z,T) where +// x = X / Z, y = Y / Z and x * y = T / Z. The second input p2 is a triple +// encoding its point (x,y) as (y - x,y + x,2 * d * x * y) where d is the +// usual Edwards curve parameter for edwards25519. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(edwards25519_pepadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(edwards25519_pepadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Registers used for inputs and outputs within basic operations. +// Here p1 and p3 are where the parameters come in anyway; +// the p2 = %rbp assignment is set up at the beginning. + +#define p3 %rdi +#define p1 %rsi +#define p2 %rbp + +// Pointers to input and output coordinates + +#define x_1 0(p1) +#define y_1 NUMSIZE(p1) +#define z_1 (2*NUMSIZE)(p1) +#define w_1 (3*NUMSIZE)(p1) + +#define ymx_2 0(p2) +#define xpy_2 NUMSIZE(p2) +#define kxy_2 (2*NUMSIZE)(p2) + +#define x_3 0(p3) +#define y_3 NUMSIZE(p3) +#define z_3 (2*NUMSIZE)(p3) +#define w_3 (3*NUMSIZE)(p3) + +// Pointer-offset pairs for temporaries on stack + +#define t0 (0*NUMSIZE)(%rsp) +#define t1 (1*NUMSIZE)(%rsp) +#define t2 (2*NUMSIZE)(%rsp) +#define t3 (3*NUMSIZE)(%rsp) +#define t4 (4*NUMSIZE)(%rsp) +#define t5 (5*NUMSIZE)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (6*NUMSIZE) + +// Macro wrapping up the basic field multiplication, only trivially +// different from a pure function call to bignum_mul_p25519_alt. 
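The tail of this macro (the leaq 0x1(%r12), bts, conditional subtraction of 19 and btr) is the usual curve25519 quotient-estimate trick; sketching it from the code rather than from any source comment: write the partially folded value as $t = 2^{255} h + r$ with $r < 2^{255}$ and take the estimate $q = h + 1$. Then
\[
r + 19(h+1) + 2^{255} \;=\; t - q\,p_{25519} + 2^{256},
\]
so adding $19(h+1)$ into $r$ with bit 255 forced on either carries out of bit 256, in which case the low 256 bits are already $t - q\,p_{25519}$ and (given the size bounds on $h$) fully reduced, or it does not, in which case $q$ was one too large and subtracting 19 and clearing bit 255 repairs it, since $2^{255} + 19 = 2^{256} - p_{25519}$.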
+ +#define mul_p25519(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %esi ; \ + movq %r12, %rax ; \ + mulq %rsi; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rsi; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movl $0x13, %esi ; \ + bts $63, %r11 ; \ + imulq %rsi, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rsi, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + btr $63, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// A version of multiplication that only guarantees output < 2 * p_25519. +// This basically skips the +1 and final correction in quotient estimation. 
+ +#define mul_4(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movl $0x26, %ebx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + shldq $0x1, %r11, %r12 ; \ + btr $0x3f, %r11 ; \ + movl $0x13, %edx ; \ + imulq %r12, %rdx ; \ + addq %rdx, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Plain 4-digit add and doubling without any normalization +// With inputs < p_25519 (indeed < 2^255) it still gives a 4-digit result, +// indeed one < 2 * p_25519 for normalized inputs. + +#define add_4(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq 8+P2, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq 16+P2, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq 24+P2, %rax ; \ + movq %rax, 24+P0 + +#define double_4(P0,P1) \ + movq P1, %rax ; \ + addq %rax, %rax ; \ + movq %rax, P0 ; \ + movq 8+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 8+P0 ; \ + movq 16+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 16+P0 ; \ + movq 24+P1, %rax ; \ + adcq %rax, %rax ; \ + movq %rax, 24+P0 + +// Subtraction of a pair of numbers < p_25519 just sufficient +// to give a 4-digit result. 
It actually always does (x - z) + (2^255-19) +// which in turn is done by (x - z) - (2^255+19) discarding the 2^256 +// implicitly + +#define sub_4(P0,P1,P2) \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + subq $19, %r8 ; \ + movq %r8, P0 ; \ + sbbq $0, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq $0, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq $0, %rax ; \ + btc $63, %rax ; \ + movq %rax, 24+P0 + +// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 + +#define sub_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ebx, %ebx ; \ + subq P2, %r8 ; \ + movq 8+P1, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movl $38, %ecx ; \ + movq 16+P1, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq 24+P1, %rax ; \ + sbbq 24+P2, %rax ; \ + cmovncq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %rax ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %rax, 24+P0 + +// Modular addition with inputs double modulus 2 * p_25519 = 2^256 - 38 +// and in general only guaranteeing a 4-digit result, not even < 2 * p_25519. + +#define add_twice4(P0,P1,P2) \ + movq P1, %r8 ; \ + xorl %ecx, %ecx ; \ + addq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + adcq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + adcq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + adcq 0x18+P2, %r11 ; \ + movl $38, %eax ; \ + cmovncq %rcx, %rax ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(edwards25519_pepadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers, make room for temps, preserve input arguments. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations. after setting up p2 in its register + + movq %rdx, p2 + + double_4(t0,z_1); + + sub_4(t1,y_1,x_1); + add_4(t2,y_1,x_1); + + mul_4(t3,w_1,kxy_2); + + mul_4(t1,t1,ymx_2); + mul_4(t2,t2,xpy_2); + + sub_twice4(t4,t0,t3); + add_twice4(t0,t0,t3); + sub_twice4(t5,t2,t1); + add_twice4(t1,t2,t1); + + mul_p25519(z_3,t4,t0); + mul_p25519(x_3,t5,t4); + mul_p25519(y_3,t0,t1); + mul_p25519(w_3,t5,t1); + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase.S index 6b2a80c7282..a2c8c72a617 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase.S @@ -336,6 +336,7 @@ movq %r11, 0x18+P0 S2N_BN_SYMBOL(edwards25519_scalarmulbase): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. 
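Read as field operations, the main sequence above is the usual unified mixed addition in extended coordinates. Assuming, as the operand names ymx_2, xpy_2 and kxy_2 suggest, that the second point is precomputed as (Y2-X2, Y2+X2, 2*d*X2*Y2) and that (x_1, y_1, z_1, w_1) holds (X1, Y1, Z1, T1), the temporaries line up with the textbook variables as follows (a worked restatement for orientation, not taken from the source):

    A = (Y1 - X1) * (Y2 - X2)      // t1 = sub_4(y_1,x_1); mul_4(t1,t1,ymx_2)
    B = (Y1 + X1) * (Y2 + X2)      // t2 = add_4(y_1,x_1); mul_4(t2,t2,xpy_2)
    C = T1 * (2*d*X2*Y2)           // t3 = mul_4(w_1,kxy_2)
    D = 2 * Z1                     // t0 = double_4(z_1)
    E = B - A                      // t5 = sub_twice4(t2,t1)
    F = D - C                      // t4 = sub_twice4(t0,t3)
    G = D + C                      // t0 = add_twice4(t0,t3)
    H = B + A                      // t1 = add_twice4(t2,t1)
    X3 = E * F                     // mul_p25519(x_3,t5,t4)
    Y3 = G * H                     // mul_p25519(y_3,t0,t1)
    Z3 = F * G                     // mul_p25519(z_3,t4,t0)
    T3 = E * H                     // mul_p25519(w_3,t5,t1)

Only the four final products use the fully reducing mul_p25519; the intermediate sums and differences are merely kept to four digits (below 2^256, or below 2 * p_25519 where noted), which is exactly what the _4 and _twice4 macros guarantee.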
diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase_alt.S index 4796e721891..8ae76964779 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmulbase_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmulbase_alt.S @@ -412,6 +412,7 @@ movq %r11, 0x18+P0 S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble.S index 993c420e056..6de8e992274 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -434,6 +434,7 @@ movq %rax, 24+P0 S2N_BN_SYMBOL(edwards25519_scalarmuldouble): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. diff --git a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S index e7c8f7a59dd..23c0ef8aa10 100644 --- a/third_party/s2n-bignum/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -551,6 +551,7 @@ movq %rax, 24+P0 S2N_BN_SYMBOL(edwards25519_scalarmuldouble_alt): + _CET_ENDBR // In this case the Windows form literally makes a subroutine call. // This avoids hassle arising from keeping code and data together. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_emontredc_8n.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_emontredc_8n.S new file mode 100644 index 00000000000..adcc9b172d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_emontredc_8n.S @@ -0,0 +1,422 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended Montgomery reduce in 8-digit blocks, results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// +// extern uint64_t bignum_emontredc_8n +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +// +// Functionally equivalent to bignum_emontredc (see that file for more detail). +// But in general assumes that the input k is a multiple of 8. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = w, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = w, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc_8n) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc_8n) + .text + +// Original input parameters are here + +#define z %rsi +#define w %rcx + +// This is copied in early once we stash away k + +#define m %rdi + +// A variable z pointer + +#define zz %rbp + +// Stack-based variables + +#define carry (%rsp) +#define innercount 8(%rsp) +#define outercount 16(%rsp) +#define k8m1 24(%rsp) + +// ----------------------------------------------------------------------------- +// Standard macros as used in pure multiplier arrays +// ----------------------------------------------------------------------------- + +// mulpadd i, j adds z[i] * rdx (now assumed = m[j]) into the window at i+j + +.macro mulpadd arg1,arg2 + mulxq 8*\arg1(z), %rax, %rbx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rbx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rbx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rbx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rbx, %r8 +.endif + +.endm + +// addrow i adds z[i] + zz[0..7] * m[j] into the window + +.macro addrow arg1 + movq 8*\arg1(m), %rdx + xorl %eax, %eax // Get a known flag state + +.if (\arg1 % 8 == 0) + adoxq 8*\arg1(zz), %r8 +.elseif (\arg1 % 8 == 1) + adoxq 8*\arg1(zz), %r9 +.elseif (\arg1 % 8 == 2) + adoxq 8*\arg1(zz), %r10 +.elseif (\arg1 % 8 == 3) + adoxq 8*\arg1(zz), %r11 +.elseif (\arg1 % 8 == 4) + adoxq 8*\arg1(zz), %r12 +.elseif (\arg1 % 8 == 5) + adoxq 8*\arg1(zz), %r13 +.elseif (\arg1 % 8 == 6) + adoxq 8*\arg1(zz), %r14 +.elseif (\arg1 % 8 == 7) + adoxq 8*\arg1(zz), %r15 +.endif + + mulpadd 0, \arg1 + +.if (\arg1 % 8 == 0) + movq %r8, 8*\arg1(zz) + movl $0, %r8d +.elseif (\arg1 % 8 == 1) + movq %r9, 8*\arg1(zz) + movl $0, %r9d +.elseif (\arg1 % 8 == 2) + movq %r10, 8*\arg1(zz) + movl $0, %r10d +.elseif (\arg1 % 8 == 3) + movq %r11, 8*\arg1(zz) + movl $0, %r11d +.elseif (\arg1 % 8 == 4) + movq %r12, 8*\arg1(zz) + movl $0, %r12d +.elseif (\arg1 % 8 == 5) + movq %r13, 8*\arg1(zz) + movl $0, %r13d +.elseif (\arg1 % 8 == 6) + movq %r14, 8*\arg1(zz) + movl $0, %r14d +.elseif (\arg1 % 8 == 7) + movq %r15, 8*\arg1(zz) + movl $0, %r15d +.endif + + mulpadd 1, \arg1 + mulpadd 2, \arg1 + mulpadd 3, \arg1 + mulpadd 4, \arg1 + mulpadd 5, \arg1 + mulpadd 6, \arg1 + mulpadd 7, \arg1 + +.if (\arg1 % 8 == 0) + adcq $0, %r8 +.elseif (\arg1 % 8 == 1) + adcq $0, %r9 +.elseif (\arg1 % 8 == 2) + adcq $0, %r10 +.elseif (\arg1 % 8 == 3) + adcq $0, %r11 +.elseif (\arg1 % 8 == 4) + adcq $0, %r12 +.elseif (\arg1 % 8 == 5) + adcq $0, %r13 +.elseif (\arg1 % 8 == 6) + adcq $0, %r14 +.elseif (\arg1 % 8 == 7) + adcq $0, %r15 +.endif + + +.endm + +// ----------------------------------------------------------------------------- +// Anti-matter versions with z and m switched, and also not writing back the z +// words, but the inverses instead, *and* also adding in the z[0..7] at the +// beginning. 
The aim is to use this in Montgomery where we discover z[j] +// entries as we go along. +// ----------------------------------------------------------------------------- + +.macro mulpadda arg1,arg2 + mulxq 8*\arg1(m), %rax, %rbx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rbx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rbx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rbx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rbx, %r8 +.endif + +.endm + +.macro adurowa arg1 + movq w, %rdx // Get the word-level modular inverse + xorl %eax, %eax // Get a known flag state +.if (\arg1 % 8 == 0) + mulxq %r8, %rdx, %rax +.elseif (\arg1 % 8 == 1) + mulxq %r9, %rdx, %rax +.elseif (\arg1 % 8 == 2) + mulxq %r10, %rdx, %rax +.elseif (\arg1 % 8 == 3) + mulxq %r11, %rdx, %rax +.elseif (\arg1 % 8 == 4) + mulxq %r12, %rdx, %rax +.elseif (\arg1 % 8 == 5) + mulxq %r13, %rdx, %rax +.elseif (\arg1 % 8 == 6) + mulxq %r14, %rdx, %rax +.elseif (\arg1 % 8 == 7) + mulxq %r15, %rdx, %rax +.endif + + movq %rdx, 8*\arg1(z) // Store multiplier word + + mulpadda 0, \arg1 + + // Note that the bottom reg of the window is zero by construction + // So it's safe just to use "mulpadda 7" here + + mulpadda 1, \arg1 + mulpadda 2, \arg1 + mulpadda 3, \arg1 + mulpadda 4, \arg1 + mulpadda 5, \arg1 + mulpadda 6, \arg1 + mulpadda 7, \arg1 // window lowest = 0 beforehand by construction + +.if (\arg1 % 8 == 0) + adcq $0, %r8 +.elseif (\arg1 % 8 == 1) + adcq $0, %r9 +.elseif (\arg1 % 8 == 2) + adcq $0, %r10 +.elseif (\arg1 % 8 == 3) + adcq $0, %r11 +.elseif (\arg1 % 8 == 4) + adcq $0, %r12 +.elseif (\arg1 % 8 == 5) + adcq $0, %r13 +.elseif (\arg1 % 8 == 6) + adcq $0, %r14 +.elseif (\arg1 % 8 == 7) + adcq $0, %r15 +.endif + +.endm + +.macro adurowza + movq w, %rdx // Get the word-level modular inverse + xorl %eax, %eax // Get a known flag state + + movq (z), %r8 // %r8 = zeroth word + mulxq %r8, %rdx, %rax // Compute multiplier word + movq %rdx, (z) // Store multiplier word + movq 8(z), %r9 + + mulpadda 0, 0 + movq 16(z), %r10 + mulpadda 1, 0 + movq 24(z), %r11 + mulpadda 2, 0 + movq 32(z), %r12 + mulpadda 3, 0 + movq 40(z), %r13 + mulpadda 4, 0 + movq 48(z), %r14 + mulpadda 5, 0 + movq 56(z), %r15 + mulpadda 6, 0 + mulpadda 7, 0 // r8 = 0 beforehand by construction + adcq $0, %r8 +.endm + +// ----------------------------------------------------------------------------- +// Hybrid top, doing an 8 block specially then multiple additional 8 blocks +// ----------------------------------------------------------------------------- + +// Multiply-add: z := z + x[i...i+7] * m + +.macro addrows + + adurowza + adurowa 1 + adurowa 2 + adurowa 3 + adurowa 4 + adurowa 5 + adurowa 6 + adurowa 7 + + movq z, zz + + movq k8m1, %rax + testq %rax, %rax + jz bignum_emontredc_8n_innerend + movq %rax, innercount +bignum_emontredc_8n_innerloop: + addq $64, zz + addq $64, m + addrow 0 + addrow 1 + addrow 2 + addrow 3 + addrow 4 + addrow 5 + addrow 6 + addrow 7 + subq $64, innercount + jnz bignum_emontredc_8n_innerloop + + movq k8m1, %rax +bignum_emontredc_8n_innerend: + subq %rax, m + + movq carry, %rbx + negq %rbx + adcq %r8, 64(z,%rax,1) + adcq %r9, 72(z,%rax,1) + adcq %r10, 
80(z,%rax,1) + adcq %r11, 88(z,%rax,1) + adcq %r12, 96(z,%rax,1) + adcq %r13, 104(z,%rax,1) + adcq %r14, 112(z,%rax,1) + adcq %r15, 120(z,%rax,1) + movl $0, %eax + adcq $0, %rax + movq %rax, carry +.endm + +// ----------------------------------------------------------------------------- +// Main code. +// ----------------------------------------------------------------------------- + +S2N_BN_SYMBOL(bignum_emontredc_8n): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save more registers to play with + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Pre-initialize the return value to 0 just in case of early exit below + + xorl %eax, %eax + +// Divide the input k by 8, and push k8m1 = (k/8 - 1)<<6 which is used as +// the scaled inner loop counter / pointer adjustment repeatedly. Also push +// k/8 itself which is here initializing the outer loop count. + + shrq $3, %rdi + jz bignum_emontredc_8n_end + + leaq -1(%rdi), %rbx + shlq $6, %rbx + pushq %rbx + pushq %rdi + +// Make space for two more variables, and set between-stages carry to 0 + + subq $16, %rsp + movq $0, carry + +// Copy m into its main home + + movq %rdx, m + +// Now just systematically add in the rows + +bignum_emontredc_8n_outerloop: + addrows + addq $64, z + subq $1, outercount + jnz bignum_emontredc_8n_outerloop + +// Pop the carry-out "p", which was stored at [%rsp], put in %rax for return + + popq %rax + +// Adjust the stack + + addq $24, %rsp + +// Reset of epilog + +bignum_emontredc_8n_end: + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_16_32.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_16_32.S new file mode 100644 index 00000000000..b0508f70168 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_16_32.S @@ -0,0 +1,508 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[16], y[16]; output z[32]; temporary buffer t[>=32] +// +// extern void bignum_kmul_16_32 +// (uint64_t z[static 32], uint64_t x[static 16], uint64_t y[static 16], +// uint64_t t[static 32]) +// +// In this x86 code the final temporary space argument t is unused, but +// it is retained in the prototype above for API consistency with ARM. 
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y, RCX = t +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y, R9 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_16_32) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_16_32) + .text + +// These parameters are kept where they come in + +#define z %rdi +#define x %rsi + +// This one gets moved to free up %rdx for muls + +#define y %rcx + +// Often used for zero + +#define zero %rbp +#define zeroe %ebp + +// mulpadd i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j + +.macro mulpadd arg1,arg2 + mulxq 8*\arg1(x), %rax, %rbx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rbx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rbx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rbx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rbx, %r8 +.endif + +.endm + +// mulpade i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j +// but re-creates the top word assuming nothing to add there + +.macro mulpade arg1,arg2 +.if ((\arg1 + \arg2) % 8 == 0) + mulxq 8*\arg1(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + mulxq 8*\arg1(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + mulxq 8*\arg1(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + mulxq 8*\arg1(x), %rax, %r12 + adcxq %rax, %r11 + adoxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + mulxq 8*\arg1(x), %rax, %r13 + adcxq %rax, %r12 + adoxq zero, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + mulxq 8*\arg1(x), %rax, %r14 + adcxq %rax, %r13 + adoxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + mulxq 8*\arg1(x), %rax, %r15 + adcxq %rax, %r14 + adoxq zero, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + mulxq 8*\arg1(x), %rax, %r8 + adcxq %rax, %r15 + adoxq zero, %r8 +.endif + +.endm + +// addrow i adds z[i] + x[0..7] * y[i] into the window + +.macro addrow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + +.if (\arg1 % 8 == 0) + adoxq 8*\arg1(z), %r8 +.elseif (\arg1 % 8 == 1) + adoxq 8*\arg1(z), %r9 +.elseif (\arg1 % 8 == 2) + adoxq 8*\arg1(z), %r10 +.elseif (\arg1 % 8 == 3) + adoxq 8*\arg1(z), %r11 +.elseif (\arg1 % 8 == 4) + adoxq 8*\arg1(z), %r12 +.elseif (\arg1 % 8 == 5) + adoxq 8*\arg1(z), %r13 +.elseif (\arg1 % 8 == 6) + adoxq 8*\arg1(z), %r14 +.elseif (\arg1 % 8 == 7) + adoxq 8*\arg1(z), %r15 +.endif + + mulpadd 0, \arg1 + +.if (\arg1 % 8 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 8 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 8 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 8 == 3) + movq %r11, 8*\arg1(z) +.elseif (\arg1 % 8 == 4) + movq %r12, 8*\arg1(z) +.elseif (\arg1 % 8 == 5) + movq %r13, 8*\arg1(z) +.elseif (\arg1 % 8 == 6) + movq %r14, 8*\arg1(z) +.elseif (\arg1 % 8 == 7) + movq %r15, 8*\arg1(z) +.endif + + mulpadd 1, \arg1 + mulpadd 2, \arg1 + mulpadd 3, \arg1 + mulpadd 4, \arg1 + mulpadd 5, \arg1 + mulpadd 6, \arg1 + mulpade 7, \arg1 + +.if (\arg1 % 8 == 0) + adcq zero, %r8 +.elseif (\arg1 % 8 == 1) + adcq zero, %r9 +.elseif (\arg1 % 8 == 2) + adcq zero, %r10 
+.elseif (\arg1 % 8 == 3) + adcq zero, %r11 +.elseif (\arg1 % 8 == 4) + adcq zero, %r12 +.elseif (\arg1 % 8 == 5) + adcq zero, %r13 +.elseif (\arg1 % 8 == 6) + adcq zero, %r14 +.elseif (\arg1 % 8 == 7) + adcq zero, %r15 +.endif + +.endm + +// Special zero version of addrow, setting up the window from scratch + +.macro addrowz + movq (y), %rdx + xorl zeroe, zeroe + + mulxq (x), %rax, %r9 + adcq %rax, (z) + + mulxq 8(x), %rax, %r10 + adcq %rax, %r9 + + mulxq 16(x), %rax, %r11 + adcq %rax, %r10 + + mulxq 24(x), %rax, %r12 + adcq %rax, %r11 + + mulxq 32(x), %rax, %r13 + adcq %rax, %r12 + + mulxq 40(x), %rax, %r14 + adcq %rax, %r13 + + mulxq 48(x), %rax, %r15 + adcq %rax, %r14 + + mulxq 56(x), %rax, %r8 + adcq %rax, %r15 + + adcq zero, %r8 +.endm + +// This is a variant where we add the initial z[0..7] at the outset. +// This makes the initialization process a bit less wasteful. By doing +// a block of 8 we get the same effect except that we add z[0..7] +// +// adurow i adds 2^{7*64} * z[i+7] + x[0..7] * y[i] into the window + +.macro adurow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + + mulpadd 0, \arg1 + +.if (\arg1 % 8 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 8 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 8 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 8 == 3) + movq %r11, 8*\arg1(z) +.elseif (\arg1 % 8 == 4) + movq %r12, 8*\arg1(z) +.elseif (\arg1 % 8 == 5) + movq %r13, 8*\arg1(z) +.elseif (\arg1 % 8 == 6) + movq %r14, 8*\arg1(z) +.elseif (\arg1 % 8 == 7) + movq %r15, 8*\arg1(z) +.endif + + mulpadd 1, \arg1 + mulpadd 2, \arg1 + mulpadd 3, \arg1 + mulpadd 4, \arg1 + mulpadd 5, \arg1 + mulpadd 6, \arg1 + mulpade 7, \arg1 + +.if (\arg1 % 8 == 0) + adcq zero, %r8 +.elseif (\arg1 % 8 == 1) + adcq zero, %r9 +.elseif (\arg1 % 8 == 2) + adcq zero, %r10 +.elseif (\arg1 % 8 == 3) + adcq zero, %r11 +.elseif (\arg1 % 8 == 4) + adcq zero, %r12 +.elseif (\arg1 % 8 == 5) + adcq zero, %r13 +.elseif (\arg1 % 8 == 6) + adcq zero, %r14 +.elseif (\arg1 % 8 == 7) + adcq zero, %r15 +.endif + +.endm + +// Special "adurow 0" case to do first stage + +.macro adurowz + movq (y), %rdx + xorl zeroe, zeroe + + movq (z), %r8 + movq 8(z), %r9 + + mulpadd 0, 0 + movq %r8, (z) + + movq 16(z), %r10 + mulpadd 1, 0 + movq 24(z), %r11 + mulpadd 2, 0 + movq 32(z), %r12 + mulpadd 3, 0 + movq 40(z), %r13 + mulpadd 4, 0 + movq 48(z), %r14 + mulpadd 5, 0 + movq 56(z), %r15 + mulpadd 6, 0 + + mulxq 56(x), %rax, %r8 + adcxq %rax, %r15 + adoxq zero, %r8 + adcxq zero, %r8 +.endm + +// Multiply-add: z := z + x[0..7] * y + +.macro addrows + adurowz + adurow 1 + adurow 2 + adurow 3 + adurow 4 + adurow 5 + adurow 6 + adurow 7 + addrow 8 + addrow 9 + addrow 10 + addrow 11 + addrow 12 + addrow 13 + addrow 14 + addrow 15 + + movq %r8, 128(z) + movq %r9, 136(z) + movq %r10, 144(z) + movq %r11, 152(z) + movq %r12, 160(z) + movq %r13, 168(z) + movq %r14, 176(z) + movq %r15, 184(z) + +.endm + +// mulrow i adds x[0..7] * y[i] into the window +// just like addrow but no addition of z[i] + +.macro mulrow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + + mulpadd 0, \arg1 + +.if (\arg1 % 8 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 8 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 8 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 8 == 3) + movq %r11, 8*\arg1(z) +.elseif (\arg1 % 8 == 4) + movq %r12, 8*\arg1(z) +.elseif (\arg1 % 8 == 5) + movq %r13, 8*\arg1(z) +.elseif (\arg1 % 8 == 6) + movq %r14, 8*\arg1(z) +.elseif (\arg1 % 8 == 7) + movq %r15, 8*\arg1(z) +.endif + + mulpadd 1, \arg1 + mulpadd 2, \arg1 + mulpadd 3, \arg1 + 
mulpadd 4, \arg1 + mulpadd 5, \arg1 + mulpadd 6, \arg1 + mulpade 7, \arg1 + +.if (\arg1 % 8 == 0) + adcq zero, %r8 +.elseif (\arg1 % 8 == 1) + adcq zero, %r9 +.elseif (\arg1 % 8 == 2) + adcq zero, %r10 +.elseif (\arg1 % 8 == 3) + adcq zero, %r11 +.elseif (\arg1 % 8 == 4) + adcq zero, %r12 +.elseif (\arg1 % 8 == 5) + adcq zero, %r13 +.elseif (\arg1 % 8 == 6) + adcq zero, %r14 +.elseif (\arg1 % 8 == 7) + adcq zero, %r15 +.endif + + +.endm + +// Special zero version of mulrow, setting up the window from scratch + +.macro mulrowz + movq (y), %rdx + xorl zeroe, zeroe + + mulxq (x), %rax, %r9 + movq %rax, (z) + + mulxq 8(x), %rax, %r10 + adcxq %rax, %r9 + + mulxq 16(x), %rax, %r11 + adcxq %rax, %r10 + + mulxq 24(x), %rax, %r12 + adcxq %rax, %r11 + + mulxq 32(x), %rax, %r13 + adcxq %rax, %r12 + + mulxq 40(x), %rax, %r14 + adcxq %rax, %r13 + + mulxq 48(x), %rax, %r15 + adcxq %rax, %r14 + + mulxq 56(x), %rax, %r8 + adcxq %rax, %r15 + + adcq zero, %r8 +.endm + +// Multiply-add: z := x[0..7] * y plus window + +.macro mulrows + mulrowz + mulrow 1 + mulrow 2 + mulrow 3 + mulrow 4 + mulrow 5 + mulrow 6 + mulrow 7 + + mulrow 8 + mulrow 9 + mulrow 10 + mulrow 11 + mulrow 12 + mulrow 13 + mulrow 14 + mulrow 15 + + movq %r8, 128(z) + movq %r9, 136(z) + movq %r10, 144(z) + movq %r11, 152(z) + movq %r12, 160(z) + movq %r13, 168(z) + movq %r14, 176(z) + movq %r15, 184(z) + +.endm + + +S2N_BN_SYMBOL(bignum_kmul_16_32): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Move y into its permanent home, freeing up %rdx for its special role in muls + + movq %rdx, y + +// Do the zeroth row as a pure product then the next as multiply-add + + mulrows + + addq $64, z + addq $64, x + addrows + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_32_64.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_32_64.S new file mode 100644 index 00000000000..640e9ab4733 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_kmul_32_64.S @@ -0,0 +1,1149 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[32], y[32]; output z[64]; temporary buffer t[>=96] +// +// extern void bignum_kmul_32_64 +// (uint64_t z[static 64], uint64_t x[static 32], uint64_t y[static 32], +// uint64_t t[static 96]) +// +// This is a Karatsuba-style function multiplying half-sized results +// internally and using temporary buffer t for intermediate results. The size +// of 96 is an overstatement for compatibility with the ARM version; it +// actually only uses 65 elements of t (64 + 1 for a stashed sign). 
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y, RCX = t +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y, R9 = t +// ----------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_kmul_32_64) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_kmul_32_64) + .text + +#define K 16 + +#define z %rdi +#define x %rsi +#define y %rcx + +#define s %r9 + +// We re-use the y variable to point at t later on, when this seems clearer + +#define t %rcx + +S2N_BN_SYMBOL(bignum_kmul_32_64): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save callee-saved registers and also push t onto the stack; we'll +// use this space to back up both t and later z. Then move the y variable +// into its longer-term home for the first few stages. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + pushq %rcx + movq %rdx, y + +// Multiply the low halves + + callq bignum_kmul_32_64_local_bignum_kmul_16_32 + +// Multiply the high halves + + leaq 16*K-0x40(%rdi), %rdi + leaq 8*K-0x40(%rsi), %rsi + leaq 8*K(%rcx), %rcx + callq bignum_kmul_32_64_local_bignum_kmul_16_32 + +// Establish %r8 as the t pointer and use the cell to back up z now + + movq (%rsp), %r8 + subq $16*K+0x40, %rdi + movq %rdi, (%rsp) + +// Form |x_lo - x_hi| starting at t + + movq -8*K-0x40(%rsi), %rax + subq -8*K-0x40+8*K(%rsi), %rax + movq %rax, (%r8) + .set I, 1 + .rep K-1 + movq -8*K-0x40+8*I(%rsi), %rax + sbbq -8*K-0x40+8*K+8*I(%rsi), %rax + movq %rax, 8*I(%r8) + .set I, (I+1) + .endr + + movl $0, %ebx + sbbq s, s // Maintain CF, set ZF for cmovs, record sign + + .set I, 0 + .rep K + movq 8*I(%r8), %rdx + movq %rdx, %rax + notq %rdx + cmovzq %rax, %rdx + adcxq %rbx, %rdx + movq %rdx, 8*I(%r8) + .set I, (I+1) + .endr + +// Form |y_hi - y_lo| (note opposite order) starting at t[K] + + movq -8*K+8*K(%rcx), %rax + subq -8*K(%rcx), %rax + movq %rax, 8*K(%r8) + .set I, 1 + .rep K-1 + movq -8*K+8*K+8*I(%rcx), %rax + sbbq -8*K+8*I(%rcx), %rax + movq %rax, 8*K+8*I(%r8) + .set I, (I+1) + .endr + + movl $0, %ebx + sbbq %rbp, %rbp // Maintain CF, set ZF for cmovs + + .set I, 0 + .rep K + movq 8*K+8*I(%r8), %rdx + movq %rdx, %rax + notq %rdx + cmovzq %rax, %rdx + adcxq %rbx, %rdx + movq %rdx, 8*K+8*I(%r8) + .set I, (I+1) + .endr + +// Stash the final sign with which to add things at t[4*K] + + xorq %rbp, s + movq s, 32*K(%r8) + +// Multiply the absolute differences, putting the result at t[2*K] +// This has the side-effect of putting t in the "right" register %rcx +// so after the load of z, we have both z and t pointers straight. + + movq %r8, %rcx + leaq 8*K(%r8), %rsi + leaq 16*K(%r8), %rdi + callq bignum_kmul_32_64_local_bignum_kmul_16_32 + movq (%rsp), z + +// Compose the middle parts [2,1] + [1,0] + [3,2], saving carry in %rbx. +// Put the sum at t, overwriting the absolute differences we no longer need. + + xorl %ebx, %ebx + .set I, 0 + .rep 2*K + movq 8*K+8*I(z), %rax + adcxq 8*I(z), %rax + adoxq 16*K+8*I(z), %rax + movq %rax, 8*I(t) + .set I, (I+1) + .endr + adoxq %rbx, %rbx + adcq $0, %rbx + +// Sign-aware addition or subtraction of the complicated term. +// We double-negate it to set CF/ZF while not spoiling its +// actual form: note that we eventually adcx to it below. 
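The conditional add-or-subtract that the comment above describes is the standard constant-time negation trick: when the stashed sign is non-zero, every limb of the cross product is complemented and a single +1 is fed in through the carry chain, i.e. a two's-complement negation spread across the loop. A stand-alone C sketch of that trick, with an invented helper name and without the fusing into the running addition that the assembly performs via cmovz/not plus adcx:

#include <stdint.h>
#include <stddef.h>

// Conditionally negate an n-limb little-endian value in constant time.
// sign must be 0 (leave unchanged) or ~0ULL (negate).
static void cond_negate(uint64_t *a, size_t n, uint64_t sign) {
  unsigned __int128 carry = sign & 1;      // the "+1" of two's complement
  for (size_t i = 0; i < n; i++) {
    carry += (uint64_t)(a[i] ^ sign);      // XOR with all-ones == bitwise NOT
    a[i] = (uint64_t)carry;
    carry >>= 64;
  }
}

In the assembly the conditionally complemented limbs are never written back separately; they are added straight into the middle words of z through the adcx chain, and the stashed sign is finally folded into the top-level carry held in %rbx.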
+ + movq 32*K(t), s + negq s + negq s + + .set I, 0 + .rep 2*K + movq 16*K+8*I(t), %rdx + movq %rdx, %rax + notq %rdx + cmovzq %rax, %rdx + adcxq 8*I(t), %rdx + movq %rdx, 8*K+8*I(z) + .set I, (I+1) + .endr + +// Bump the accumulated carry. This must end up >= 0 because it's the top +// word of a value of the form ... + h * h' + l * l' - (h - l) * (h' - l') >= 0 + + adcxq s, %rbx + +// Finally propagate the carry to the top part + + xorl %eax, %eax + addq %rbx, 24*K(z) + .set I, 1 + .rep K-1 + adcq %rax, 24*K+8*I(z) + .set I, (I+1) + .endr + +// Restore and return. The first pop is not needed for the ABI but +// we need to adjust the stack anyway so it seems reasonable. + + popq %rcx + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// Local copy of half-length subroutine. This has a slightly different +// interface, expecting y argument in %rcx directly, and not doing any +// save-restore of the other registers. It naturally moves z and x on by +// 0x40, which we compensate for when it is called by adjusting offsets. + +bignum_kmul_32_64_local_bignum_kmul_16_32: + movq (%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %r9 + movq %rax, (%rdi) + mulxq 0x8(%rsi), %rax, %r10 + adcxq %rax, %r9 + mulxq 0x10(%rsi), %rax, %r11 + adcxq %rax, %r10 + mulxq 0x18(%rsi), %rax, %r12 + adcxq %rax, %r11 + mulxq 0x20(%rsi), %rax, %r13 + adcxq %rax, %r12 + mulxq 0x28(%rsi), %rax, %r14 + adcxq %rax, %r13 + mulxq 0x30(%rsi), %rax, %r15 + adcxq %rax, %r14 + mulxq 0x38(%rsi), %rax, %r8 + adcxq %rax, %r15 + adcq %rbp, %r8 + movq 0x8(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x8(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcq %rbp, %r9 + movq 0x10(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x10(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(%rsi), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcq %rbp, %r10 + movq 0x18(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x18(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(%rsi), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcq %rbp, %r11 + movq 0x20(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx 
+ adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x20(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(%rsi), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcq %rbp, %r12 + movq 0x28(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x28(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + movq 0x30(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x30(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + movq 0x38(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x38(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + movq 0x40(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + movq %r8, 0x40(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x38(%rsi), %rax, %r8 + adcxq %rax, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movq 0x48(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x48(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 
0x30(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcq %rbp, %r9 + movq 0x50(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x50(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(%rsi), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcq %rbp, %r10 + movq 0x58(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x58(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(%rsi), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcq %rbp, %r11 + movq 0x60(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x60(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(%rsi), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcq %rbp, %r12 + movq 0x68(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x68(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + movq 0x70(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x70(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + movq 0x78(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x78(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 
0x18(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + movq %r8, 0x80(%rdi) + movq %r9, 0x88(%rdi) + movq %r10, 0x90(%rdi) + movq %r11, 0x98(%rdi) + movq %r12, 0xa0(%rdi) + movq %r13, 0xa8(%rdi) + movq %r14, 0xb0(%rdi) + movq %r15, 0xb8(%rdi) + addq $0x40, %rdi + addq $0x40, %rsi + movq (%rcx), %rdx + xorl %ebp, %ebp + movq (%rdi), %r8 + movq 0x8(%rdi), %r9 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + movq %r8, (%rdi) + movq 0x10(%rdi), %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq 0x18(%rdi), %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq 0x20(%rdi), %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x28(%rdi), %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq 0x30(%rdi), %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq 0x38(%rdi), %r15 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x38(%rsi), %rax, %r8 + adcxq %rax, %r15 + adoxq %rbp, %r8 + adcxq %rbp, %r8 + movq 0x8(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x8(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcq %rbp, %r9 + movq 0x10(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x10(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(%rsi), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcq %rbp, %r10 + movq 0x18(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x18(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(%rsi), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcq %rbp, %r11 + movq 0x20(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x20(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(%rsi), %rax, 
%rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(%rsi), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcq %rbp, %r12 + movq 0x28(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x28(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + movq 0x30(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x30(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + movq 0x38(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x38(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + movq 0x40(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x40(%rdi), %r8 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + movq %r8, 0x40(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x38(%rsi), %rax, %r8 + adcxq %rax, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movq 0x48(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x48(%rdi), %r9 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x48(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcq %rbp, %r9 + movq 0x50(%rcx), %rdx + xorl %ebp, %ebp + 
adoxq 0x50(%rdi), %r10 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x50(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(%rsi), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcq %rbp, %r10 + movq 0x58(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x58(%rdi), %r11 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x58(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(%rsi), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcq %rbp, %r11 + movq 0x60(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x60(%rdi), %r12 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x60(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(%rsi), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcq %rbp, %r12 + movq 0x68(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x68(%rdi), %r13 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x68(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + movq 0x70(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x70(%rdi), %r14 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x70(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + movq 0x78(%rcx), %rdx + xorl %ebp, %ebp + adoxq 0x78(%rdi), %r15 + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x78(%rdi) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r10 
+ adoxq %rbx, %r11 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + movq %r8, 0x80(%rdi) + movq %r9, 0x88(%rdi) + movq %r10, 0x90(%rdi) + movq %r11, 0x98(%rdi) + movq %r12, 0xa0(%rdi) + movq %r13, 0xa8(%rdi) + movq %r14, 0xb0(%rdi) + movq %r15, 0xb8(%rdi) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_16_32.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_16_32.S new file mode 100644 index 00000000000..86e853d2cb6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_16_32.S @@ -0,0 +1,540 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[16]; output z[32]; temporary buffer t[>=24] +// +// extern void bignum_ksqr_16_32 +// (uint64_t z[static 32], uint64_t x[static 16], uint64_t t[static 24]); +// +// In this x86 code the final temporary space argument t is unused, but +// it is retained in the prototype above for API consistency with ARM. +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = t +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ksqr_16_32) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ksqr_16_32) + .text + +#define z %rdi +#define x %rsi + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// ------------------------------------------------------------------------ +// mulpadd i, j adds rdx * x[i] into the window at the i+j point +// ------------------------------------------------------------------------ + +.macro mulpadd arg1,arg2 + mulxq 8*\arg1(x), %rax, %rcx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rcx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rcx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rcx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rcx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rcx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rcx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rcx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rcx, %r8 +.endif + +.endm + +// ------------------------------------------------------------------------ +// mulpade i, j adds rdx * x[i] into the window at i+j +// but re-creates the top word assuming nothing to add there +// ------------------------------------------------------------------------ + +.macro mulpade arg1,arg2 +.if ((\arg1 + \arg2) % 8 == 0) + mulxq 8*\arg1(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + mulxq 8*\arg1(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + mulxq 8*\arg1(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + mulxq 8*\arg1(x), %rax, %r12 + adcxq %rax, %r11 + adoxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 
4) + mulxq 8*\arg1(x), %rax, %r13 + adcxq %rax, %r12 + adoxq zero, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + mulxq 8*\arg1(x), %rax, %r14 + adcxq %rax, %r13 + adoxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + mulxq 8*\arg1(x), %rax, %r15 + adcxq %rax, %r14 + adoxq zero, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + mulxq 8*\arg1(x), %rax, %r8 + adcxq %rax, %r15 + adoxq zero, %r8 +.endif + +.endm + +// ------------------------------------------------------------------------ +// addrow i,j adds z[i+j] + x[i..i+7] * x[j] into the window +// ------------------------------------------------------------------------ + +.macro addrow arg1,arg2 + movq 8*\arg2(x), %rdx + xorl zeroe, zeroe // Get a known flag state and give a zero reg + +.if ((\arg1 + \arg2) % 8 == 0) + adoxq 8*(\arg1+\arg2)(z), %r8 +.elseif ((\arg1 + \arg2) % 8 == 1) + adoxq 8*(\arg1+\arg2)(z), %r9 +.elseif ((\arg1 + \arg2) % 8 == 2) + adoxq 8*(\arg1+\arg2)(z), %r10 +.elseif ((\arg1 + \arg2) % 8 == 3) + adoxq 8*(\arg1+\arg2)(z), %r11 +.elseif ((\arg1 + \arg2) % 8 == 4) + adoxq 8*(\arg1+\arg2)(z), %r12 +.elseif ((\arg1 + \arg2) % 8 == 5) + adoxq 8*(\arg1+\arg2)(z), %r13 +.elseif ((\arg1 + \arg2) % 8 == 6) + adoxq 8*(\arg1+\arg2)(z), %r14 +.elseif ((\arg1 + \arg2) % 8 == 7) + adoxq 8*(\arg1+\arg2)(z), %r15 +.endif + + mulpadd \arg1, \arg2 + +.if ((\arg1 + \arg2) % 8 == 0) + movq %r8, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 1) + movq %r9, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 2) + movq %r10, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 3) + movq %r11, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 4) + movq %r12, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 5) + movq %r13, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 6) + movq %r14, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 7) + movq %r15, 8*(\arg1+\arg2)(z) +.endif + + mulpadd (\arg1+1), \arg2 + mulpadd (\arg1+2), \arg2 + mulpadd (\arg1+3), \arg2 + mulpadd (\arg1+4), \arg2 + mulpadd (\arg1+5), \arg2 + mulpade (\arg1+6), \arg2 + mulpade (\arg1+7), \arg2 + +.if ((\arg1 + \arg2) % 8 == 0) + adcxq zero, %r8 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq zero, %r13 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq zero, %r15 +.endif + + +.endm + + +// ------------------------------------------------------------------------ +// Adds off-diagonal part of x[i..i+7]^2 into the window, writes 0..7 back +// ------------------------------------------------------------------------ + +.macro sqr arg1 + + xorl zeroe, zeroe + +// Set up the initial window + + movq 16*\arg1+8(z), %r9 + movq 16*\arg1+16(z), %r10 + movq 16*\arg1+24(z), %r11 + movq 16*\arg1+32(z), %r12 + movq 16*\arg1+40(z), %r13 + movq 16*\arg1+48(z), %r14 + movq 16*\arg1+56(z), %r15 + +// Add in the first diagonal [%r8..%r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70 + + movq 8*\arg1(x), %rdx + mulpadd (\arg1+1), (\arg1+0) + movq %r9, 16*\arg1+8(z) + mulpadd (\arg1+2), (\arg1+0) + movq %r10, 16*\arg1+16(z) + mulpadd (\arg1+3), (\arg1+0) + mulpadd (\arg1+4), (\arg1+0) + mulpadd (\arg1+5), (\arg1+0) + mulpadd (\arg1+6), (\arg1+0) + mulpade (\arg1+7), (\arg1+0) + adcxq zero, %r8 + +// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54 + + xorl zeroe, zeroe + movq 8*\arg1+8(x), %rdx + mulpadd (\arg1+2), 
(\arg1+1) + movq %r11, 16*\arg1+24(z) + mulpadd (\arg1+3), (\arg1+1) + movq %r12, 16*\arg1+32(z) + mulpadd (\arg1+4), (\arg1+1) + mulpadd (\arg1+5), (\arg1+1) + mulpadd (\arg1+6), (\arg1+1) + mulpade (\arg1+7), (\arg1+1) + movq 8*\arg1+32(x), %rdx + mulpade (\arg1+5), (\arg1+4) + adcxq zero, %r10 + +// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65 + + xorl zeroe, zeroe + movq 8*\arg1+16(x), %rdx + mulpadd (\arg1+3), (\arg1+2) + movq %r13, 16*\arg1+40(z) + mulpadd (\arg1+4), (\arg1+2) + movq %r14, 16*\arg1+48(z) + mulpadd (\arg1+5), (\arg1+2) + mulpadd (\arg1+6), (\arg1+2) + mulpadd (\arg1+7), (\arg1+2) + movq 8*\arg1+48(x), %rdx + mulpade (\arg1+4), (\arg1+6) + mulpade (\arg1+5), (\arg1+6) + adcxq zero, %r12 + +// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76 + + xorl zeroe, zeroe + movq 8*\arg1+24(x), %rdx + mulpadd (\arg1+4), (\arg1+3) + movq %r15, 16*\arg1+56(z) + mulpadd (\arg1+5), (\arg1+3) + mulpadd (\arg1+6), (\arg1+3) + mulpadd (\arg1+7), (\arg1+3) + movq 8*\arg1+56(x), %rdx + mulpadd (\arg1+4), (\arg1+7) + mulpade (\arg1+5), (\arg1+7) + mulpade (\arg1+6), (\arg1+7) + adcxq zero, %r14 +.endm + +// ------------------------------------------------------------------------ +// Multiply-add: z := z + x[i...i+7] * x +// ------------------------------------------------------------------------ + +.macro addrows arg1 + + sqr \arg1 + + .set I, (\arg1+8) +.rep (8-\arg1) + addrow \arg1, I + .set I, (I+1) +.endr + + movq %r8, 8*(16+\arg1)(z) + movq %r9, 8*(17+\arg1)(z) + movq %r10, 8*(18+\arg1)(z) + movq %r11, 8*(19+\arg1)(z) + movq %r12, 8*(20+\arg1)(z) + movq %r13, 8*(21+\arg1)(z) + movq %r14, 8*(22+\arg1)(z) +.endm + + +// ------------------------------------------------------------------------ +// mulrow i,j adds x[i..i+7] * x[j] into the window +// just like addrow but no addition of z[i+j] +// ------------------------------------------------------------------------ + +.macro mulrow arg1,arg2 + movq 8*\arg2(x), %rdx + xorl zeroe, zeroe // Get a known flag state and give a zero reg + + mulpadd \arg1, \arg2 + +.if ((\arg1 + \arg2) % 8 == 0) + movq %r8, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 1) + movq %r9, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 2) + movq %r10, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 3) + movq %r11, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 4) + movq %r12, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 5) + movq %r13, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 6) + movq %r14, 8*(\arg1+\arg2)(z) +.elseif ((\arg1 + \arg2) % 8 == 7) + movq %r15, 8*(\arg1+\arg2)(z) +.endif + + mulpadd (\arg1+1), \arg2 + mulpadd (\arg1+2), \arg2 + mulpadd (\arg1+3), \arg2 + mulpadd (\arg1+4), \arg2 + mulpadd (\arg1+5), \arg2 +.if ((\arg1 + \arg2) % 8 == 0) + mulpade (\arg1+6), \arg2 +.else + mulpadd (\arg1+6), \arg2 +.endif + + mulpade (\arg1+7), \arg2 + +.if ((\arg1 + \arg2) % 8 == 0) + adcxq zero, %r8 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq zero, %r13 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq zero, %r15 +.endif + + +.endm + +// ------------------------------------------------------------------------ +// Compute off-diagonal part of x[0..7]^2, write back 1..7 elements and +// set up the high part in the standard register window. DOES NOT WRITE z[0]! 
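+// (z[0] itself is only written at the very end, from the low word of x[0]^2
+// in the final diagonal pass.)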
+// ------------------------------------------------------------------------ + +.macro sqrz + + xorl zeroe, zeroe + +// Set initial window [%r8..%r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70 + + movq (x), %rdx + mulxq 8(x), %r9, %rax + movq %r9, 8(z) + mulxq 16(x), %r10, %rcx + adcxq %rax, %r10 + movq %r10, 16(z) + mulxq 24(x), %r11, %rax + adcxq %rcx, %r11 + mulxq 32(x), %r12, %rcx + adcxq %rax, %r12 + mulxq 40(x), %r13, %rax + adcxq %rcx, %r13 + mulxq 48(x), %r14, %rcx + adcxq %rax, %r14 + mulxq 56(x), %r15, %r8 + adcxq %rcx, %r15 + adcxq zero, %r8 + +// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54 + + xorl zeroe, zeroe + movq 8(x), %rdx + mulpadd 2, 1 + movq %r11, 24(z) + mulpadd 3, 1 + movq %r12, 32(z) + mulpadd 4, 1 + mulpadd 5, 1 + mulpadd 6, 1 + mulpade 7, 1 + movq 32(x), %rdx + mulpade 5, 4 + adcxq zero, %r10 + +// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65 + + xorl zeroe, zeroe + movq 16(x), %rdx + mulpadd 3, 2 + movq %r13, 40(z) + mulpadd 4, 2 + movq %r14, 48(z) + mulpadd 5, 2 + mulpadd 6, 2 + mulpadd 7, 2 + movq 48(x), %rdx + mulpade 4, 6 + mulpade 5, 6 + adcxq zero, %r12 + +// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76 + + xorl zeroe, zeroe + movq 24(x), %rdx + mulpadd 4, 3 + movq %r15, 56(z) + mulpadd 5, 3 + mulpadd 6, 3 + mulpadd 7, 3 + movq 56(x), %rdx + mulpadd 4, 7 + mulpade 5, 7 + mulpade 6, 7 + adcxq zero, %r14 +.endm + +// ------------------------------------------------------------------------ +// Multiply-add: z := x[0...7] * x off-diagonal elements +// ------------------------------------------------------------------------ + +.macro mulrows + sqrz + + .set I, 8 +.rep 8 + mulrow 0, I + .set I, (I+1) +.endr + + movq %r8, 128(z) + movq %r9, 136(z) + movq %r10, 144(z) + movq %r11, 152(z) + movq %r12, 160(z) + movq %r13, 168(z) + movq %r14, 176(z) + movq %r15, 184(z) +.endm + +// ------------------------------------------------------------------------ +// The actual code +// ------------------------------------------------------------------------ + + + +S2N_BN_SYMBOL(bignum_ksqr_16_32): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Now just systematically add in the rows to get all off-diagonal elements + + mulrows + addrows 8 + +// Double and add the diagonal elements. 
Note that z[0] was never written above + + xorl zeroe, zeroe + movq (x), %rdx + mulxq %rdx, %rax, %rcx + movq %rax, (z) + + movq 8(z), %rdx + adcxq %rdx, %rdx + adoxq %rcx, %rdx + movq %rdx, 8(z) + + .set I, 1 +.rep 14 + movq 8*I(x), %rdx + mulxq %rdx, %rax, %rcx + + movq 8*(2*I)(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 8*(2*I)(z) + + movq 8*(2*I+1)(z), %rdx + adcxq %rdx, %rdx + adoxq %rcx, %rdx + movq %rdx, 8*(2*I+1)(z) + .set I, (I+1) +.endr + + movq 8*I(x), %rdx + mulxq %rdx, %rax, %rcx + + movq 8*(2*I)(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 8*(2*I)(z) + + adcxq zero, %rcx + adoxq zero, %rcx + movq %rcx, 8*(2*I+1)(z) + .set I, (I+1) + + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_32_64.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_32_64.S new file mode 100644 index 00000000000..00956d919e4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_ksqr_32_64.S @@ -0,0 +1,798 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[32]; output z[64]; temporary buffer t[>=72] +// +// extern void bignum_ksqr_32_64 +// (uint64_t z[static 64], uint64_t x[static 32], uint64_t t[static 72]); +// +// This is a Karatsuba-style function squaring half-sized results +// and using temporary buffer t for intermediate results. The size of 72 +// is an overstatement for compatibility with the ARM version; it actually +// only uses 65 elements of t (64 + 1 for a suspended carry). 
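+//
+// In outline: with the input split as x = l + B*h, where B = 2^1024 and l, h
+// are the 16-word halves, the code uses 2*l*h = l^2 + h^2 - (l - h)^2, so
+//
+//     x^2 = l^2 + B^2*h^2 + B*(l^2 + h^2 - |l - h|^2)
+//
+// and only three half-size squarings plus one signed difference are needed.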
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = t +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ksqr_32_64) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ksqr_32_64) + .text + +#define K 16 + +#define z %rdi +#define x %rsi +#define t %rcx + +S2N_BN_SYMBOL(bignum_ksqr_32_64): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save callee-preserved registers once and for all at the outset +// Later we further reshuffle the input arguments to avoid extra saves + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Move the temp space pointer since we need %rdx for multiplications + + movq %rdx, t + +// Square the low half + + callq bignum_ksqr_32_64_local_bignum_sqr_16_32 + +// Square the high half; from here on x and z are modified + + leaq 8*K(x), x // input at x+8*K + leaq 16*K(z), z // result at z+16*K + callq bignum_ksqr_32_64_local_bignum_sqr_16_32 + +// Form |x_lo - x_hi|, stored at t + + movq -8*K(x), %rax + subq (x), %rax + movq %rax, (t) + .set I, 1 + .rep K-1 + movq -8*K+8*I(x), %rax + sbbq 8*I(x), %rax + movq %rax, 8*I(t) + .set I, (I+1) + .endr + + movl $0, %ebx + sbbq %rax, %rax // Maintain CF, set ZF for cmovs + + .set I, 0 + .rep K + movq 8*I(t), %rdx + movq %rdx, %rax + notq %rdx + adcxq %rbx, %rdx + cmovzq %rax, %rdx + movq %rdx, 8*I(t) + .set I, (I+1) + .endr + +// Compose the middle parts [2,1] + [1,0] + [3,2] +// Put the low half of this at t[K] and the top half in place at z[2*K]; a +// fully in-place version is awkward with the otherwise beneficial double +// carry chain. Stash the carry suspended from the 3k position at the end of +// the temp buffer t[4*K]. + + xorl %edx, %edx + .set I, 0 + .rep K + movq -16*K+8*K+8*I(z), %rax + adcxq -16*K+8*I(z), %rax + adoxq -16*K+16*K+8*I(z), %rax + movq %rax, 8*K+8*I(t) + .set I, (I+1) + .endr + + .rep K + movq -16*K+8*K+8*I(z), %rax + adcxq -16*K+8*I(z), %rax + adoxq -16*K+16*K+8*I(z), %rax + movq %rax, -16*K+8*K+8*I(z) + .set I, (I+1) + .endr + + adoxq %rdx, %rdx + adcq $0, %rdx + movq %rdx, 32*K(t) + +// Square the absolute difference, putting the result M at t[2*K]. +// This involves another shuffle so now t' = z_orig and x' = t_orig +// while z' points within the temp buffer to the product M itself + + movq t, x + leaq -16*K(z), t + leaq 16*K(x), z + callq bignum_ksqr_32_64_local_bignum_sqr_16_32 + +// Subtract M, pausing at the 3k position to bump down accumulated carry. +// The carry cannot go negative since it's the top word of a value +// of the form ... 
+ h^2 + l^2 - (h - l)^2 >= 0 + + movq 8*K(x), %rax + subq (z), %rax + movq %rax, 8*K(t) + + .set I, 1 + + .rep (K-1) + movq 8*K+8*I(x), %rax + sbbq 8*I(z), %rax + movq %rax, 8*K+8*I(t) + .set I, (I+1) + .endr + + .rep K + movq 8*K+8*I(t), %rax + sbbq 8*I(z), %rax + movq %rax, 8*K+8*I(t) + .set I, (I+1) + .endr + + movq 32*K(x), %rdx + sbbq $0, %rdx + +// Finally propagate the carry to the top quarter + + xorl %eax, %eax + addq %rdx, 24*K(t) + .set I, 1 + .rep K-1 + adcq %rax, 24*K+8*I(t) + .set I, (I+1) + .endr + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// Local copy of the half-length subroutine + +bignum_ksqr_32_64_local_bignum_sqr_16_32: + xorl %ebp, %ebp + movq (x), %rdx + mulxq 0x8(x), %r9, %rax + movq %r9, 0x8(z) + mulxq 0x10(x), %r10, %rbx + adcxq %rax, %r10 + movq %r10, 0x10(z) + mulxq 0x18(x), %r11, %rax + adcxq %rbx, %r11 + mulxq 0x20(x), %r12, %rbx + adcxq %rax, %r12 + mulxq 0x28(x), %r13, %rax + adcxq %rbx, %r13 + mulxq 0x30(x), %r14, %rbx + adcxq %rax, %r14 + mulxq 0x38(x), %r15, %r8 + adcxq %rbx, %r15 + adcxq %rbp, %r8 + xorl %ebp, %ebp + movq 0x8(x), %rdx + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x18(z) + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x20(z) + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(x), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + movq 0x20(x), %rdx + mulxq 0x28(x), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcxq %rbp, %r10 + xorl %ebp, %ebp + movq 0x10(x), %rdx + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x28(z) + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x30(z) + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq 0x30(x), %rdx + mulxq 0x20(x), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + mulxq 0x28(x), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcxq %rbp, %r12 + xorl %ebp, %ebp + movq 0x18(x), %rdx + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x38(z) + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq 0x38(x), %rdx + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(x), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + mulxq 0x30(x), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcxq %rbp, %r14 + movq 0x40(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + movq %r8, 0x40(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x30(x), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + mulxq 0x38(x), %rax, %r8 + adcxq %rax, %r15 + adoxq %rbp, %r8 + adcxq %rbp, %r8 + movq 0x48(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq 
%rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x48(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(x), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcxq %rbp, %r9 + movq 0x50(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x50(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(x), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcxq %rbp, %r10 + movq 0x58(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x58(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(x), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcxq %rbp, %r11 + movq 0x60(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x60(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(x), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcxq %rbp, %r12 + movq 0x68(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x68(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(x), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcxq %rbp, %r13 + movq 0x70(x), %rdx + xorl %ebp, %ebp + mulxq (x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x70(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(x), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcxq %rbp, %r14 + movq 0x78(x), %rdx + xorl %ebp, %ebp + 
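// Last cross row, x[15] * x[0..7], folded into the window for z[15..23] +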
mulxq (x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x78(z) + mulxq 0x8(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x18(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x20(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(x), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcxq %rbp, %r15 + movq %r8, 0x80(z) + movq %r9, 0x88(z) + movq %r10, 0x90(z) + movq %r11, 0x98(z) + movq %r12, 0xa0(z) + movq %r13, 0xa8(z) + movq %r14, 0xb0(z) + movq %r15, 0xb8(z) + xorl %ebp, %ebp + movq 0x88(z), %r9 + movq 0x90(z), %r10 + movq 0x98(z), %r11 + movq 0xa0(z), %r12 + movq 0xa8(z), %r13 + movq 0xb0(z), %r14 + movq 0xb8(z), %r15 + movq 0x40(x), %rdx + mulxq 0x48(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x88(z) + mulxq 0x50(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x90(z) + mulxq 0x58(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x60(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x68(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x70(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x78(x), %rax, %r8 + adcxq %rax, %r15 + adoxq %rbp, %r8 + adcxq %rbp, %r8 + xorl %ebp, %ebp + movq 0x48(x), %rdx + mulxq 0x50(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x98(z) + mulxq 0x58(x), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0xa0(z) + mulxq 0x60(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x68(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x70(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x78(x), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + movq 0x60(x), %rdx + mulxq 0x68(x), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcxq %rbp, %r10 + xorl %ebp, %ebp + movq 0x50(x), %rdx + mulxq 0x58(x), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0xa8(z) + mulxq 0x60(x), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0xb0(z) + mulxq 0x68(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x70(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x78(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq 0x70(x), %rdx + mulxq 0x60(x), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + mulxq 0x68(x), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcxq %rbp, %r12 + xorl %ebp, %ebp + movq 0x58(x), %rdx + mulxq 0x60(x), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0xb8(z) + mulxq 0x68(x), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x70(x), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x78(x), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq 0x78(x), %rdx + mulxq 0x60(x), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(x), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + mulxq 0x70(x), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcxq %rbp, %r14 + movq %r8, 0xc0(z) + movq %r9, 0xc8(z) + movq %r10, 0xd0(z) + movq %r11, 0xd8(z) + movq %r12, 0xe0(z) + movq %r13, 0xe8(z) + movq %r14, 0xf0(z) + xorl %ebp, %ebp + movq (x), %rdx + mulxq %rdx, %rax, %rbx + movq %rax, (z) + movq 0x8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x8(z) + movq 0x8(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x10(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq 
%rdx, 0x10(z) + movq 0x18(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x18(z) + movq 0x10(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x20(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x20(z) + movq 0x28(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x28(z) + movq 0x18(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x30(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x30(z) + movq 0x38(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x38(z) + movq 0x20(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x40(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x40(z) + movq 0x48(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x48(z) + movq 0x28(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x50(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x50(z) + movq 0x58(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x58(z) + movq 0x30(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x60(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x60(z) + movq 0x68(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x68(z) + movq 0x38(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x70(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x70(z) + movq 0x78(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x78(z) + movq 0x40(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x80(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x80(z) + movq 0x88(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x88(z) + movq 0x48(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0x90(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0x90(z) + movq 0x98(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0x98(z) + movq 0x50(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xa0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xa0(z) + movq 0xa8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0xa8(z) + movq 0x58(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xb0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xb0(z) + movq 0xb8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0xb8(z) + movq 0x60(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xc0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xc0(z) + movq 0xc8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0xc8(z) + movq 0x68(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xd0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xd0(z) + movq 0xd8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0xd8(z) + movq 0x70(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xe0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xe0(z) + movq 0xe8(z), %rdx + adcxq %rdx, %rdx + adoxq %rbx, %rdx + movq %rdx, 0xe8(z) + movq 0x78(x), %rdx + mulxq %rdx, %rax, %rbx + movq 0xf0(z), %rdx + adcxq %rdx, %rdx + adoxq %rax, %rdx + movq %rdx, 0xf0(z) + adcxq %rbp, %rbx + adoxq %rbp, %rbx + movq %rbx, 0xf8(z) + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8.S new file mode 100644 index 00000000000..1df6d6c9d88 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8.S @@ -0,0 +1,174 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +// +// extern void bignum_mul_4_8 +// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// Add in x[i] * %rdx to the (i,i+1) position with the register window +// Would be nice to have conditional expressions reg[i], reg[i+1] ... + +.macro mulpadd arg1,arg2 + mulxq 8*\arg2(x), %rax, %rbx +.if ((\arg1 + \arg2) % 4 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 4 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 4 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 4 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r8 +.endif + +.endm + + +// Add in the whole j'th row + +.macro addrow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + + mulpadd \arg1, 0 + +.if (\arg1 % 4 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 4 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 4 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 4 == 3) + movq %r11, 8*\arg1(z) +.endif + + mulpadd \arg1, 1 + mulpadd \arg1, 2 + +.if (\arg1 % 4 == 0) + mulxq 24(x), %rax, %r8 + adcxq %rax, %r11 + adoxq zero, %r8 + adcxq zero, %r8 +.elseif (\arg1 % 4 == 1) + mulxq 24(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 + adcxq zero, %r9 +.elseif (\arg1 % 4 == 2) + mulxq 24(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 + adcxq zero, %r10 +.elseif (\arg1 % 4 == 3) + mulxq 24(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 + adcxq zero, %r11 +.endif + +.endm + + + +S2N_BN_SYMBOL(bignum_mul_4_8): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbp + pushq %rbx + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different +// Write back the zero-zero product and then accumulate +// %r8,%r11,%r10,%r9 as y[0] * x from 1..4 + + movq (y), %rdx + + mulxq (x), %r8, %r9 + movq %r8, (z) + + mulxq 8(x), %rbx, %r10 + adcxq %rbx, %r9 + + mulxq 16(x), %rbx, %r11 + adcxq %rbx, %r10 + + mulxq 24(x), %rbx, %r8 + adcxq %rbx, %r11 + adcxq zero, %r8 + +// Now all the other rows in a uniform pattern + + addrow 1 + addrow 2 + addrow 3 + +// Now write back the additional columns + + movq %r8, 32(z) + movq %r9, 40(z) + movq %r10, 48(z) + movq %r11, 56(z) + +// Restore registers and return + + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8_alt.S new file mode 100644 index 00000000000..4730daa6751 --- /dev/null +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_4_8_alt.S @@ -0,0 +1,146 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[4], y[4]; output z[8] +// +// extern void bignum_mul_4_8_alt +// (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// This is moved from %rdx to free it for muls + +#define y %rcx + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %r8 +#define t1 %r9 +#define t2 %r10 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_mul_4_8_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Copy y into a safe register to start with + + movq %rdx, y + +// Result term 0 + + movq (x), %rax + mulq (y) + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combads(t1,t0,(x),8(y)) + combadz(t2,t1,t0,8(x),(y)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadz(t0,t2,t1,(x),16(y)) + combadd(t0,t2,t1,8(x),8(y)) + combadd(t0,t2,t1,16(x),(y)) + movq t1, 16(z) + +// Result term 3 + + xorq t1, t1 + combadz(t1,t0,t2,(x),24(y)) + combadd(t1,t0,t2,8(x),16(y)) + combadd(t1,t0,t2,16(x),8(y)) + combadd(t1,t0,t2,24(x),(y)) + movq t2, 24(z) + +// Result term 4 + + xorq t2, t2 + combadz(t2,t1,t0,8(x),24(y)) + combadd(t2,t1,t0,16(x),16(y)) + combadd(t2,t1,t0,24(x),8(y)) + movq t0, 32(z) + +// Result term 5 + + xorq t0, t0 + combadz(t0,t2,t1,16(x),24(y)) + combadd(t0,t2,t1,24(x),16(y)) + movq t1, 40(z) + +// Result term 6 + + xorq t1, t1 + combads(t0,t2,24(x),24(y)) + movq t2, 48(z) + +// Result term 7 + + movq t0, 56(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12.S new file mode 100644 index 00000000000..87dbfa09d4e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12.S @@ -0,0 +1,210 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12 +// (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// Add in x[i] * %rdx to the (i,i+1) position with the register window +// Would be nice to have conditional expressions reg[i], reg[i+1] ... + +.macro mulpadd arg1,arg2 + mulxq 8*\arg2(x), %rax, %rbx +.if ((\arg1 + \arg2) % 6 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 6 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 6 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 6 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r12 +.elseif ((\arg1 + \arg2) % 6 == 4) + adcxq %rax, %r12 + adoxq %rbx, %r13 +.elseif ((\arg1 + \arg2) % 6 == 5) + adcxq %rax, %r13 + adoxq %rbx, %r8 +.endif + +.endm + + +// Add in the whole j'th row + +.macro addrow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + + mulpadd \arg1, 0 + +.if (\arg1 % 6 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 6 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 6 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 6 == 3) + movq %r11, 8*\arg1(z) +.elseif (\arg1 % 6 == 4) + movq %r12, 8*\arg1(z) +.elseif (\arg1 % 6 == 5) + movq %r13, 8*\arg1(z) +.endif + + mulpadd \arg1, 1 + mulpadd \arg1, 2 + mulpadd \arg1, 3 + mulpadd \arg1, 4 + +.if (\arg1 % 6 == 0) + mulxq 40(x), %rax, %r8 + adcxq %rax, %r13 + adoxq zero, %r8 + adcxq zero, %r8 +.elseif (\arg1 % 6 == 1) + mulxq 40(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 + adcxq zero, %r9 +.elseif (\arg1 % 6 == 2) + mulxq 40(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 + adcxq zero, %r10 +.elseif (\arg1 % 6 == 3) + mulxq 40(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 + adcxq zero, %r11 +.elseif (\arg1 % 6 == 4) + mulxq 40(x), %rax, %r12 + adcxq %rax, %r11 + adoxq zero, %r12 + adcxq zero, %r12 +.elseif (\arg1 % 6 == 5) + mulxq 40(x), %rax, %r13 + adcxq %rax, %r12 + adoxq zero, %r13 + adcxq zero, %r13 +.endif + +.endm + + + +S2N_BN_SYMBOL(bignum_mul_6_12): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different +// Write back the zero-zero product and then accumulate +// %r8,%r13,%r12,%r11,%r10,%r9 as y[0] * x from 1..6 + + movq (y), %rdx + + mulxq (x), %r8, %r9 + movq %r8, (z) + + mulxq 8(x), %rbx, %r10 + adcxq %rbx, %r9 + + mulxq 16(x), %rbx, %r11 + adcxq %rbx, %r10 + + mulxq 24(x), %rbx, %r12 + adcxq %rbx, %r11 + + mulxq 32(x), %rbx, %r13 + adcxq %rbx, %r12 + + mulxq 40(x), %rbx, %r8 + adcxq %rbx, %r13 + adcxq zero, %r8 + +// Now all the other rows in a 
uniform pattern + + addrow 1 + addrow 2 + addrow 3 + addrow 4 + addrow 5 + +// Now write back the additional columns + + movq %r8, 48(z) + movq %r9, 56(z) + movq %r10, 64(z) + movq %r11, 72(z) + movq %r12, 80(z) + movq %r13, 88(z) + +// Restore registers and return + + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12_alt.S new file mode 100644 index 00000000000..36bceceb536 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_6_12_alt.S @@ -0,0 +1,185 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[6], y[6]; output z[12] +// +// extern void bignum_mul_6_12_alt +// (uint64_t z[static 12], uint64_t x[static 6], uint64_t y[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_6_12_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// This is moved from %rdx to free it for muls + +#define y %rcx + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %r8 +#define t1 %r9 +#define t2 %r10 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_mul_6_12_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Copy y into a safe register to start with + + movq %rdx, y + +// Result term 0 + + movq (x), %rax + mulq (y) + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combads(t1,t0,(x),8(y)) + combadz(t2,t1,t0,8(x),(y)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadz(t0,t2,t1,(x),16(y)) + combadd(t0,t2,t1,8(x),8(y)) + combadd(t0,t2,t1,16(x),(y)) + movq t1, 16(z) + +// Result term 3 + + xorq t1, t1 + combadz(t1,t0,t2,(x),24(y)) + combadd(t1,t0,t2,8(x),16(y)) + combadd(t1,t0,t2,16(x),8(y)) + combadd(t1,t0,t2,24(x),(y)) + movq t2, 24(z) + +// Result term 4 + + xorq t2, t2 + combadz(t2,t1,t0,(x),32(y)) + combadd(t2,t1,t0,8(x),24(y)) + combadd(t2,t1,t0,16(x),16(y)) + combadd(t2,t1,t0,24(x),8(y)) + combadd(t2,t1,t0,32(x),(y)) + movq t0, 32(z) + +// Result term 5 + + xorq t0, t0 + combadz(t0,t2,t1,(x),40(y)) + combadd(t0,t2,t1,8(x),32(y)) + combadd(t0,t2,t1,16(x),24(y)) + combadd(t0,t2,t1,24(x),16(y)) + combadd(t0,t2,t1,32(x),8(y)) + combadd(t0,t2,t1,40(x),(y)) + movq t1, 40(z) + +// Result term 6 + + xorq t1, t1 + 
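// t1, just written out as z[5], is cleared to serve as the fresh top word of the rotating window +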
combadz(t1,t0,t2,8(x),40(y)) + combadd(t1,t0,t2,16(x),32(y)) + combadd(t1,t0,t2,24(x),24(y)) + combadd(t1,t0,t2,32(x),16(y)) + combadd(t1,t0,t2,40(x),8(y)) + movq t2, 48(z) + +// Result term 7 + + xorq t2, t2 + combadz(t2,t1,t0,16(x),40(y)) + combadd(t2,t1,t0,24(x),32(y)) + combadd(t2,t1,t0,32(x),24(y)) + combadd(t2,t1,t0,40(x),16(y)) + movq t0, 56(z) + +// Result term 8 + + xorq t0, t0 + combadz(t0,t2,t1,24(x),40(y)) + combadd(t0,t2,t1,32(x),32(y)) + combadd(t0,t2,t1,40(x),24(y)) + movq t1, 64(z) + +// Result term 9 + + xorq t1, t1 + combadz(t1,t0,t2,32(x),40(y)) + combadd(t1,t0,t2,40(x),32(y)) + movq t2, 72(z) + +// Result term 10 + + combads(t1,t0,40(x),40(y)) + movq t0, 80(z) + +// Result term 11 + + movq t1, 88(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16.S new file mode 100644 index 00000000000..598fccd51d5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16.S @@ -0,0 +1,260 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16 +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// mulpadd i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j + +.macro mulpadd arg1,arg2 + mulxq 8*\arg1(x), %rax, %rbx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rbx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rbx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rbx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rbx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rbx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rbx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rbx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rbx, %r8 +.endif + +.endm + +// mulpade i, j adds x[i] * rdx (now assumed = y[j]) into the window at i+j +// but re-creates the top word assuming nothing to add there + +.macro mulpade arg1,arg2 +.if ((\arg1 + \arg2) % 8 == 0) + mulxq 8*\arg1(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + mulxq 8*\arg1(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + mulxq 8*\arg1(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + mulxq 8*\arg1(x), %rax, %r12 + adcxq %rax, %r11 + adoxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + mulxq 8*\arg1(x), %rax, %r13 + adcxq %rax, %r12 + adoxq zero, %r13 +.elseif ((\arg1 + 
\arg2) % 8 == 5) + mulxq 8*\arg1(x), %rax, %r14 + adcxq %rax, %r13 + adoxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + mulxq 8*\arg1(x), %rax, %r15 + adcxq %rax, %r14 + adoxq zero, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + mulxq 8*\arg1(x), %rax, %r8 + adcxq %rax, %r15 + adoxq zero, %r8 +.endif + +.endm + +// Add in the whole j'th row + +.macro addrow arg1 + movq 8*\arg1(y), %rdx + xorl zeroe, zeroe + + mulpadd 0, \arg1 + +.if (\arg1 % 8 == 0) + movq %r8, 8*\arg1(z) +.elseif (\arg1 % 8 == 1) + movq %r9, 8*\arg1(z) +.elseif (\arg1 % 8 == 2) + movq %r10, 8*\arg1(z) +.elseif (\arg1 % 8 == 3) + movq %r11, 8*\arg1(z) +.elseif (\arg1 % 8 == 4) + movq %r12, 8*\arg1(z) +.elseif (\arg1 % 8 == 5) + movq %r13, 8*\arg1(z) +.elseif (\arg1 % 8 == 6) + movq %r14, 8*\arg1(z) +.elseif (\arg1 % 8 == 7) + movq %r15, 8*\arg1(z) +.endif + + mulpadd 1, \arg1 + mulpadd 2, \arg1 + mulpadd 3, \arg1 + mulpadd 4, \arg1 + mulpadd 5, \arg1 + mulpadd 6, \arg1 + mulpade 7, \arg1 + +.if (\arg1 % 8 == 0) + adcq zero, %r8 +.elseif (\arg1 % 8 == 1) + adcq zero, %r9 +.elseif (\arg1 % 8 == 2) + adcq zero, %r10 +.elseif (\arg1 % 8 == 3) + adcq zero, %r11 +.elseif (\arg1 % 8 == 4) + adcq zero, %r12 +.elseif (\arg1 % 8 == 5) + adcq zero, %r13 +.elseif (\arg1 % 8 == 6) + adcq zero, %r14 +.elseif (\arg1 % 8 == 7) + adcq zero, %r15 +.endif + +.endm + + +S2N_BN_SYMBOL(bignum_mul_8_16): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different +// Write back the zero-zero product and then accumulate +// %r8,%r15,%r14,%r13,%r12,%r11,%r10,%r9 as y[0] * x from 1..8 + + movq (y), %rdx + + mulxq (x), %r8, %r9 + movq %r8, (z) + + mulxq 8(x), %rbx, %r10 + adcq %rbx, %r9 + + mulxq 16(x), %rbx, %r11 + adcq %rbx, %r10 + + mulxq 24(x), %rbx, %r12 + adcq %rbx, %r11 + + mulxq 32(x), %rbx, %r13 + adcq %rbx, %r12 + + mulxq 40(x), %rbx, %r14 + adcq %rbx, %r13 + + mulxq 48(x), %rbx, %r15 + adcq %rbx, %r14 + + mulxq 56(x), %rbx, %r8 + adcq %rbx, %r15 + adcq zero, %r8 + +// Now all the other rows in a uniform pattern + + addrow 1 + addrow 2 + addrow 3 + addrow 4 + addrow 5 + addrow 6 + addrow 7 + +// Now write back the additional columns + + movq %r8, 64(z) + movq %r9, 72(z) + movq %r10, 80(z) + movq %r11, 88(z) + movq %r12, 96(z) + movq %r13, 104(z) + movq %r14, 112(z) + movq %r15, 120(z) + +// Real epilog + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16_alt.S new file mode 100644 index 00000000000..a1a2a67e714 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_mul_8_16_alt.S @@ -0,0 +1,233 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[8], y[8]; output z[16] +// +// extern void bignum_mul_8_16_alt +// (uint64_t z[static 16], uint64_t x[static 8], uint64_t y[static 8]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_8_16_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_8_16_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// This is moved from %rdx to free it for muls + +#define y %rcx + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %r8 +#define t1 %r9 +#define t2 %r10 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_mul_8_16_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Copy y into a safe register to start with + + movq %rdx, y + +// Result term 0 + + movq (x), %rax + mulq (y) + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combads(t1,t0,(x),8(y)) + combadz(t2,t1,t0,8(x),(y)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadz(t0,t2,t1,(x),16(y)) + combadd(t0,t2,t1,8(x),8(y)) + combadd(t0,t2,t1,16(x),(y)) + movq t1, 16(z) + +// Result term 3 + + xorq t1, t1 + combadz(t1,t0,t2,(x),24(y)) + combadd(t1,t0,t2,8(x),16(y)) + combadd(t1,t0,t2,16(x),8(y)) + combadd(t1,t0,t2,24(x),(y)) + movq t2, 24(z) + +// Result term 4 + + xorq t2, t2 + combadz(t2,t1,t0,(x),32(y)) + combadd(t2,t1,t0,8(x),24(y)) + combadd(t2,t1,t0,16(x),16(y)) + combadd(t2,t1,t0,24(x),8(y)) + combadd(t2,t1,t0,32(x),(y)) + movq t0, 32(z) + +// Result term 5 + + xorq t0, t0 + combadz(t0,t2,t1,(x),40(y)) + combadd(t0,t2,t1,8(x),32(y)) + combadd(t0,t2,t1,16(x),24(y)) + combadd(t0,t2,t1,24(x),16(y)) + combadd(t0,t2,t1,32(x),8(y)) + combadd(t0,t2,t1,40(x),(y)) + movq t1, 40(z) + +// Result term 6 + + xorq t1, t1 + combadz(t1,t0,t2,(x),48(y)) + combadd(t1,t0,t2,8(x),40(y)) + combadd(t1,t0,t2,16(x),32(y)) + combadd(t1,t0,t2,24(x),24(y)) + combadd(t1,t0,t2,32(x),16(y)) + combadd(t1,t0,t2,40(x),8(y)) + combadd(t1,t0,t2,48(x),(y)) + movq t2, 48(z) + +// Result term 7 + + xorq t2, t2 + combadz(t2,t1,t0,(x),56(y)) + combadd(t2,t1,t0,8(x),48(y)) + combadd(t2,t1,t0,16(x),40(y)) + combadd(t2,t1,t0,24(x),32(y)) + combadd(t2,t1,t0,32(x),24(y)) + combadd(t2,t1,t0,40(x),16(y)) + combadd(t2,t1,t0,48(x),8(y)) + combadd(t2,t1,t0,56(x),(y)) + movq t0, 56(z) + +// Result term 8 + + xorq t0, t0 + combadz(t0,t2,t1,8(x),56(y)) + combadd(t0,t2,t1,16(x),48(y)) + combadd(t0,t2,t1,24(x),40(y)) + combadd(t0,t2,t1,32(x),32(y)) + combadd(t0,t2,t1,40(x),24(y)) + combadd(t0,t2,t1,48(x),16(y)) + combadd(t0,t2,t1,56(x),8(y)) + movq t1, 64(z) + +// Result term 9 + + xorq t1, t1 + combadz(t1,t0,t2,16(x),56(y)) + 
combadd(t1,t0,t2,24(x),48(y)) + combadd(t1,t0,t2,32(x),40(y)) + combadd(t1,t0,t2,40(x),32(y)) + combadd(t1,t0,t2,48(x),24(y)) + combadd(t1,t0,t2,56(x),16(y)) + movq t2, 72(z) + +// Result term 10 + + xorq t2, t2 + combadz(t2,t1,t0,24(x),56(y)) + combadd(t2,t1,t0,32(x),48(y)) + combadd(t2,t1,t0,40(x),40(y)) + combadd(t2,t1,t0,48(x),32(y)) + combadd(t2,t1,t0,56(x),24(y)) + movq t0, 80(z) + +// Result term 11 + + xorq t0, t0 + combadz(t0,t2,t1,32(x),56(y)) + combadd(t0,t2,t1,40(x),48(y)) + combadd(t0,t2,t1,48(x),40(y)) + combadd(t0,t2,t1,56(x),32(y)) + movq t1, 88(z) + +// Result term 12 + + xorq t1, t1 + combadz(t1,t0,t2,40(x),56(y)) + combadd(t1,t0,t2,48(x),48(y)) + combadd(t1,t0,t2,56(x),40(y)) + movq t2, 96(z) + +// Result term 13 + + xorq t2, t2 + combadz(t2,t1,t0,48(x),56(y)) + combadd(t2,t1,t0,56(x),48(y)) + movq t0, 104(z) + +// Result term 14 + + combads(t2,t1,56(x),56(y)) + movq t1, 112(z) + +// Result term 11 + + movq t2, 120(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8.S new file mode 100644 index 00000000000..4b19675569b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8.S @@ -0,0 +1,145 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[4]; output z[8] +// +// extern void bignum_sqr_4_8 (uint64_t z[static 8], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// Other registers + +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 +#define d4 %r11 +#define d5 %r12 +#define d6 %r13 + + + +S2N_BN_SYMBOL(bignum_sqr_4_8): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbp + pushq %r12 + pushq %r13 + +// Set up an initial window [d6;...d1] = [23;03;01] + + movq (x), %rdx + mulxq 8(x), d1, d2 + mulxq 24(x), d3, d4 + movq 16(x), %rdx + mulxq 24(x), d5, d6 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulxq (x), %rax, %rcx + adcxq %rax, d2 + adoxq %rcx, d3 + mulxq 8(x), %rax, %rcx + adcxq %rax, d3 + adoxq %rcx, d4 + movq 24(x), %rdx + mulxq 8(x), %rax, %rcx + adcxq %rax, d4 + adoxq %rcx, d5 + adcxq zero, d5 + adoxq zero, d6 + adcxq zero, d6 + +// In principle this is otiose as CF and OF carries are absorbed at this point +// However it seems helpful for the OOO engine to be told it's a fresh start + + xorl zeroe, zeroe + +// Double and add to the 00 + 11 + 22 + 33 terms +// +// We could use shift-double but this seems tidier and in larger squarings 
+// it was actually more efficient. I haven't experimented with this small +// case to see how much that matters. Note: the writeback here is sprinkled +// into the sequence in such a way that things still work if z = x, i.e. if +// the output overwrites the input buffer and beyond. + + movq (x), %rdx + mulxq %rdx, %rax, %rdx + movq %rax, (z) + adcxq d1, d1 + adoxq %rdx, d1 + movq 8(x), %rdx + movq d1, 8(z) + mulxq %rdx, %rax, %rdx + adcxq d2, d2 + adoxq %rax, d2 + adcxq d3, d3 + adoxq %rdx, d3 + movq 16(x), %rdx + movq d2, 16(z) + mulxq %rdx, %rax, %rdx + adcxq d4, d4 + adoxq %rax, d4 + adcxq d5, d5 + adoxq %rdx, d5 + movq 24(x), %rdx + movq d3, 24(z) + mulxq %rdx, %rax, %rdx + movq d4, 32(z) + adcxq d6, d6 + movq d5, 40(z) + adoxq %rax, d6 + movq d6, 48(z) + adcxq zero, %rdx + adoxq zero, %rdx + movq %rdx, 56(z) + +// Restore saved registers and return + + popq %r13 + popq %r12 + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8_alt.S new file mode 100644 index 00000000000..693a57d74d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_4_8_alt.S @@ -0,0 +1,134 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[4]; output z[8] +// +// extern void bignum_sqr_4_8_alt +// (uint64_t z[static 8], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_4_8_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_4_8_alt) + .text + +// Input arguments + +#define z %rdi +#define x %rsi + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %rcx +#define t1 %r8 +#define t2 %r9 + +// Macro for the key "multiply and add to (c,h,l)" step, for square term + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling before adding, for non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_sqr_4_8_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combadd2(t2,t1,t0,(x),8(x)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadd1(t0,t2,t1,8(x)) + combadd2(t0,t2,t1,(x),16(x)) + movq t1, 16(z) + +// Result term 3 + + xorq t1, t1 + combadd2(t1,t0,t2,(x),24(x)) + combadd2(t1,t0,t2,8(x),16(x)) + movq t2, 24(z) + +// Result term 4 + + xorq t2, t2 + combadd2(t2,t1,t0,8(x),24(x)) + combadd1(t2,t1,t0,16(x)) + movq t0, 32(z) + +// Result term 5 + + xorq t0, t0 + 
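// Column 5 gets only the doubled cross product x[2]*x[3] +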
combadd2(t0,t2,t1,16(x),24(x)) + movq t1, 40(z) + +// Result term 6 + + xorq t1, t1 + combads(t0,t2,24(x)) + movq t2, 48(z) + +// Result term 7 + + movq t0, 56(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12.S new file mode 100644 index 00000000000..f0abc6480d3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12.S @@ -0,0 +1,214 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12 (uint64_t z[static 12], uint64_t x[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// Other registers + +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 +#define d4 %r11 +#define d5 %r12 +#define d6 %r13 +#define d7 %r14 +#define d8 %r15 +#define d9 %rbx + +// Care is needed: re-using the zero register + +#define d10 %rbp + + +S2N_BN_SYMBOL(bignum_sqr_6_12): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Set up an initial window [d8;...d1] = [34;05;03;01] + + movq (x), %rdx + mulxq 8(x), d1, d2 + mulxq 24(x), d3, d4 + mulxq 40(x), d5, d6 + movq 24(x), %rdx + mulxq 32(x), d7, d8 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 + 14 + 15 to that window +// (no carry-out possible since we add it to the top of a product) + + movq 16(x), %rdx + mulxq (x), %rax, %rcx + adcxq %rax, d2 + adoxq %rcx, d3 + mulxq 8(x), %rax, %rcx + adcxq %rax, d3 + adoxq %rcx, d4 + movq 8(x), %rdx + mulxq 24(x), %rax, %rcx + adcxq %rax, d4 + adoxq %rcx, d5 + mulxq 32(x), %rax, %rcx + adcxq %rax, d5 + adoxq %rcx, d6 + mulxq 40(x), %rax, %rcx + adcxq %rax, d6 + adoxq %rcx, d7 + adcxq zero, d7 + adoxq zero, d8 + adcxq zero, d8 + +// Again zero out the flags. Actually they are already cleared but it may +// help decouple these in the OOO engine not to wait for the chain above + + xorl zeroe, zeroe + +// Now chain in the 04 + 23 + 24 + 25 + 35 + 45 terms +// We are running out of registers and here our zero register is not zero! 
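+// (d10 aliases %rbp, so once the high word of the 45 product lands there,
+// the $0 loaded into %eax below stands in for the zero register.)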
+ + movq 32(x), %rdx + mulxq (x), %rax, %rcx + adcxq %rax, d4 + adoxq %rcx, d5 + movq 16(x), %rdx + mulxq 24(x), %rax, %rcx + adcxq %rax, d5 + adoxq %rcx, d6 + mulxq 32(x), %rax, %rcx + adcxq %rax, d6 + adoxq %rcx, d7 + mulxq 40(x), %rax, %rcx + adcxq %rax, d7 + adoxq %rcx, d8 + movq 24(x), %rdx + mulxq 40(x), %rax, d9 + adcxq %rax, d8 + adoxq zero, d9 + movq 32(x), %rdx + mulxq 40(x), %rax, d10 + adcxq %rax, d9 + movl $0, %eax + adoxq %rax, d10 + adcxq %rax, d10 + +// Again, just for a clear fresh start for the flags + + xorl %eax, %eax + +// Double and add to the 00 + 11 + 22 + 33 + 44 + 55 terms +// +// We could use shift-double but this seems tidier and in larger squarings +// it was actually more efficient. I haven't experimented with this small +// case to see how much that matters. Note: the writeback here is sprinkled +// into the sequence in such a way that things still work if z = x, i.e. if +// the output overwrites the input buffer and beyond. + + movq (x), %rdx + mulxq %rdx, %rax, %rdx + movq %rax, (z) + adcxq d1, d1 + adoxq %rdx, d1 + movq 8(x), %rdx + movq d1, 8(z) + mulxq %rdx, %rax, %rdx + adcxq d2, d2 + adoxq %rax, d2 + adcxq d3, d3 + adoxq %rdx, d3 + movq 16(x), %rdx + movq d2, 16(z) + mulxq %rdx, %rax, %rdx + adcxq d4, d4 + adoxq %rax, d4 + adcxq d5, d5 + adoxq %rdx, d5 + movq 24(x), %rdx + movq d3, 24(z) + mulxq %rdx, %rax, %rdx + adcxq d6, d6 + adoxq %rax, d6 + adcxq d7, d7 + adoxq %rdx, d7 + movq 32(x), %rdx + movq d4, 32(z) + mulxq %rdx, %rax, %rdx + adcxq d8, d8 + adoxq %rax, d8 + adcxq d9, d9 + adoxq %rdx, d9 + movq 40(x), %rdx + movq d5, 40(z) + mulxq %rdx, %rax, %rdx + movq d6, 48(z) + adcxq d10, d10 + movq d7, 56(z) + adoxq %rax, d10 + movq d8, 64(z) + movl $0, %eax + movq d9, 72(z) + adcxq %rax, %rdx + movq d10, 80(z) + adoxq %rax, %rdx + movq %rdx, 88(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12_alt.S new file mode 100644 index 00000000000..f576b42e165 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_6_12_alt.S @@ -0,0 +1,196 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[6]; output z[12] +// +// extern void bignum_sqr_6_12_alt (uint64_t z[static 12], uint64_t x[static 6]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_6_12_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_6_12_alt) + .text + +// Input arguments + +#define z %rdi +#define x %rsi + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %r8 +#define t1 %r9 +#define t2 %r10 + +// Additional temporaries for local windows to share doublings + +#define u0 %rcx +#define u1 %r11 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// Set up initial window (c,h,l) = numa * numb + +#define combaddz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + xorq c, c ; \ + movq %rax, l ; \ + movq %rdx, h + +// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l) + +#define doubladd(c,h,l,hh,ll) \ + addq ll, ll ; \ + adcq hh, hh ; \ + adcq c, c ; \ + addq ll, l ; \ + adcq hh, h ; \ + adcq $0, c + +// Square term incorporation (c,h,l) += numba^2 + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling directly before adding, for single non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_sqr_6_12_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combadd2(t2,t1,t0,(x),8(x)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadd1(t0,t2,t1,8(x)) + combadd2(t0,t2,t1,(x),16(x)) + movq t1, 16(z) + +// Result term 3 + + combaddz(t1,u1,u0,(x),24(x)) + combadd(t1,u1,u0,8(x),16(x)) + doubladd(t1,t0,t2,u1,u0) + movq t2, 24(z) + +// Result term 4 + + combaddz(t2,u1,u0,(x),32(x)) + combadd(t2,u1,u0,8(x),24(x)) + doubladd(t2,t1,t0,u1,u0) + combadd1(t2,t1,t0,16(x)) + movq t0, 32(z) + +// Result term 5 + + combaddz(t0,u1,u0,(x),40(x)) + combadd(t0,u1,u0,8(x),32(x)) + combadd(t0,u1,u0,16(x),24(x)) + doubladd(t0,t2,t1,u1,u0) + movq t1, 40(z) + +// Result term 6 + + combaddz(t1,u1,u0,8(x),40(x)) + combadd(t1,u1,u0,16(x),32(x)) + doubladd(t1,t0,t2,u1,u0) + combadd1(t1,t0,t2,24(x)) + movq t2, 48(z) + +// Result term 7 + + combaddz(t2,u1,u0,16(x),40(x)) + combadd(t2,u1,u0,24(x),32(x)) + doubladd(t2,t1,t0,u1,u0) + movq t0, 56(z) + +// Result term 8 + + xorq t0, t0 + combadd2(t0,t2,t1,24(x),40(x)) + combadd1(t0,t2,t1,32(x)) + movq t1, 64(z) + +// Result term 9 + + xorq t1, t1 + combadd2(t1,t0,t2,32(x),40(x)) + movq t2, 72(z) + +// Result term 10 + + combads(t1,t0,40(x)) + movq t0, 80(z) + +// Result term 11 + + movq t1, 88(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + 
ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16.S new file mode 100644 index 00000000000..b90101c0887 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16.S @@ -0,0 +1,298 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16 (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// mulpadd i, j adds rdx * x[i] into the window at the i+j point + +.macro mulpadd arg1,arg2 + mulxq 8*\arg1(x), %rax, %rcx +.if ((\arg1 + \arg2) % 8 == 0) + adcxq %rax, %r8 + adoxq %rcx, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + adcxq %rax, %r9 + adoxq %rcx, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + adcxq %rax, %r10 + adoxq %rcx, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + adcxq %rax, %r11 + adoxq %rcx, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + adcxq %rax, %r12 + adoxq %rcx, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + adcxq %rax, %r13 + adoxq %rcx, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + adcxq %rax, %r14 + adoxq %rcx, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + adcxq %rax, %r15 + adoxq %rcx, %r8 +.endif + +.endm + +// mulpade i, j adds rdx * x[i] into the window at i+j +// but re-creates the top word assuming nothing to add there + +.macro mulpade arg1,arg2 +.if ((\arg1 + \arg2) % 8 == 0) + mulxq 8*\arg1(x), %rax, %r9 + adcxq %rax, %r8 + adoxq zero, %r9 +.elseif ((\arg1 + \arg2) % 8 == 1) + mulxq 8*\arg1(x), %rax, %r10 + adcxq %rax, %r9 + adoxq zero, %r10 +.elseif ((\arg1 + \arg2) % 8 == 2) + mulxq 8*\arg1(x), %rax, %r11 + adcxq %rax, %r10 + adoxq zero, %r11 +.elseif ((\arg1 + \arg2) % 8 == 3) + mulxq 8*\arg1(x), %rax, %r12 + adcxq %rax, %r11 + adoxq zero, %r12 +.elseif ((\arg1 + \arg2) % 8 == 4) + mulxq 8*\arg1(x), %rax, %r13 + adcxq %rax, %r12 + adoxq zero, %r13 +.elseif ((\arg1 + \arg2) % 8 == 5) + mulxq 8*\arg1(x), %rax, %r14 + adcxq %rax, %r13 + adoxq zero, %r14 +.elseif ((\arg1 + \arg2) % 8 == 6) + mulxq 8*\arg1(x), %rax, %r15 + adcxq %rax, %r14 + adoxq zero, %r15 +.elseif ((\arg1 + \arg2) % 8 == 7) + mulxq 8*\arg1(x), %rax, %r8 + adcxq %rax, %r15 + adoxq zero, %r8 +.endif + +.endm + +.macro diagonals + + xorl zeroe, zeroe + +// Set initial window [%r8..%r10] + 2 wb = 10 + 20 + 30 + 40 + 50 + 60 + 70 + + movq (x), %rdx + mulxq 8(x), %r9, %rax + movq %r9, 8(z) + mulxq 16(x), %r10, %rcx + adcxq %rax, %r10 + movq %r10, 16(z) + mulxq 24(x), %r11, %rax + adcxq %rcx, %r11 + mulxq 32(x), %r12, %rcx + adcxq %rax, %r12 + mulxq 40(x), %r13, %rax + adcxq %rcx, %r13 + mulxq 48(x), %r14, %rcx + adcxq %rax, %r14 + mulxq 56(x), %r15, %r8 + adcxq %rcx, %r15 + adcxq zero, %r8 + +// Add in the next diagonal = 21 + 31 + 41 + 51 + 61 + 71 + 54 + + xorl zeroe, zeroe + movq 8(x), %rdx + mulpadd 2, 1 + 
movq %r11, 24(z) + mulpadd 3, 1 + movq %r12, 32(z) + mulpadd 4, 1 + mulpadd 5, 1 + mulpadd 6, 1 + mulpade 7, 1 + movq 32(x), %rdx + mulpade 5, 4 + adcxq zero, %r10 + +// And the next one = 32 + 42 + 52 + 62 + 72 + 64 + 65 + + xorl zeroe, zeroe + movq 16(x), %rdx + mulpadd 3, 2 + movq %r13, 40(z) + mulpadd 4, 2 + movq %r14, 48(z) + mulpadd 5, 2 + mulpadd 6, 2 + mulpadd 7, 2 + movq 48(x), %rdx + mulpade 4, 6 + mulpade 5, 6 + adcxq zero, %r12 + +// And the final one = 43 + 53 + 63 + 73 + 74 + 75 + 76 + + xorl zeroe, zeroe + movq 24(x), %rdx + mulpadd 4, 3 + movq %r15, 56(z) + mulpadd 5, 3 + movq %r8, 64(z) + mulpadd 6, 3 + mulpadd 7, 3 + movq 56(x), %rdx + mulpadd 4, 7 + mulpade 5, 7 + mulpade 6, 7 + adcxq zero, %r14 + +// Double and add things; use z[1]..z[8] and thereafter the registers +// %r9..%r15 which haven't been written back yet + + xorl zeroe, zeroe + movq (x), %rdx + mulxq %rdx, %rax, %rcx + movq %rax, (z) + movq 8(z), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 8(z) + + movq 16(z), %rax + movq 8(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 16(z) + movq 24(z), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 24(z) + + movq 32(z), %rax + movq 16(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 32(z) + movq 40(z), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 40(z) + + movq 48(z), %rax + movq 24(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 48(z) + movq 56(z), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 56(z) + + movq 64(z), %rax + movq 32(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 64(z) + adcxq %r9, %r9 + adoxq %rcx, %r9 + movq %r9, 72(z) + + movq 40(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %r10, %r10 + adoxq %rdx, %r10 + movq %r10, 80(z) + adcxq %r11, %r11 + adoxq %rcx, %r11 + movq %r11, 88(z) + + movq 48(x), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %r12, %r12 + adoxq %rdx, %r12 + movq %r12, 96(z) + adcxq %r13, %r13 + adoxq %rcx, %r13 + movq %r13, 104(z) + + movq 56(x), %rdx + mulxq %rdx, %rdx, %r15 + adcxq %r14, %r14 + adoxq %rdx, %r14 + movq %r14, 112(z) + adcxq zero, %r15 + adoxq zero, %r15 + movq %r15, 120(z) + +.endm + + +S2N_BN_SYMBOL(bignum_sqr_8_16): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Do the multiplication + + diagonals + +// Real epilog + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16_alt.S new file mode 100644 index 00000000000..2991033f49d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/fastmul/bignum_sqr_8_16_alt.S @@ -0,0 +1,231 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square, z := x^2 +// Input x[8]; output z[16] +// +// extern void bignum_sqr_8_16_alt (uint64_t z[static 16], uint64_t x[static 8]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_8_16_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_8_16_alt) + .text + +// Input arguments + +#define z %rdi +#define x %rsi + +// Other variables used as a rotating 3-word window to add terms to + +#define t0 %r8 +#define t1 %r9 +#define t2 %r10 + +// Additional temporaries for local windows to share doublings + +#define u0 %rcx +#define u1 %r11 + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// Set up initial window (c,h,l) = numa * numb + +#define combaddz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + xorq c, c ; \ + movq %rax, l ; \ + movq %rdx, h + +// Doubling step (c,h,l) = 2 * (c,hh,ll) + (0,h,l) + +#define doubladd(c,h,l,hh,ll) \ + addq ll, ll ; \ + adcq hh, hh ; \ + adcq c, c ; \ + addq ll, l ; \ + adcq hh, h ; \ + adcq $0, c + +// Square term incorporation (c,h,l) += numba^2 + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling directly before adding, for single non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_sqr_8_16_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, (z) + movq %rdx, t0 + xorq t1, t1 + +// Result term 1 + + xorq t2, t2 + combadd2(t2,t1,t0,(x),8(x)) + movq t0, 8(z) + +// Result term 2 + + xorq t0, t0 + combadd1(t0,t2,t1,8(x)) + combadd2(t0,t2,t1,(x),16(x)) + movq t1, 16(z) + +// Result term 3 + + combaddz(t1,u1,u0,(x),24(x)) + combadd(t1,u1,u0,8(x),16(x)) + doubladd(t1,t0,t2,u1,u0) + movq t2, 24(z) + +// Result term 4 + + combaddz(t2,u1,u0,(x),32(x)) + combadd(t2,u1,u0,8(x),24(x)) + doubladd(t2,t1,t0,u1,u0) + combadd1(t2,t1,t0,16(x)) + movq t0, 32(z) + +// Result term 5 + + combaddz(t0,u1,u0,(x),40(x)) + combadd(t0,u1,u0,8(x),32(x)) + combadd(t0,u1,u0,16(x),24(x)) + doubladd(t0,t2,t1,u1,u0) + movq t1, 40(z) + +// Result term 6 + + combaddz(t1,u1,u0,(x),48(x)) + combadd(t1,u1,u0,8(x),40(x)) + combadd(t1,u1,u0,16(x),32(x)) + doubladd(t1,t0,t2,u1,u0) + combadd1(t1,t0,t2,24(x)) + movq t2, 48(z) + +// Result term 7 + + combaddz(t2,u1,u0,(x),56(x)) + combadd(t2,u1,u0,8(x),48(x)) + combadd(t2,u1,u0,16(x),40(x)) + combadd(t2,u1,u0,24(x),32(x)) + doubladd(t2,t1,t0,u1,u0) + movq t0, 56(z) + +// Result term 8 + + combaddz(t0,u1,u0,8(x),56(x)) + combadd(t0,u1,u0,16(x),48(x)) + combadd(t0,u1,u0,24(x),40(x)) + doubladd(t0,t2,t1,u1,u0) + combadd1(t0,t2,t1,32(x)) + movq t1, 64(z) + +// Result term 9 + + combaddz(t1,u1,u0,16(x),56(x)) + combadd(t1,u1,u0,24(x),48(x)) + 
combadd(t1,u1,u0,32(x),40(x)) + doubladd(t1,t0,t2,u1,u0) + movq t2, 72(z) + +// Result term 10 + + combaddz(t2,u1,u0,24(x),56(x)) + combadd(t2,u1,u0,32(x),48(x)) + doubladd(t2,t1,t0,u1,u0) + combadd1(t2,t1,t0,40(x)) + movq t0, 80(z) + +// Result term 11 + + combaddz(t0,u1,u0,32(x),56(x)) + combadd(t0,u1,u0,40(x),48(x)) + doubladd(t0,t2,t1,u1,u0) + movq t1, 88(z) + +// Result term 12 + + xorq t1, t1 + combadd2(t1,t0,t2,40(x),56(x)) + combadd1(t1,t0,t2,48(x)) + movq t2, 96(z) + +// Result term 13 + + xorq t2, t2 + combadd2(t2,t1,t0,48(x),56(x)) + movq t0, 104(z) + +// Result term 14 + + combads(t2,t1,56(x)) + movq t1, 112(z) + +// Result term 15 + + movq t2, 120(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_add.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_add.S new file mode 100644 index 00000000000..58851be2b25 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_add.S @@ -0,0 +1,154 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add, z := x + y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +// +// extern uint64_t bignum_add +// (uint64_t p, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the z := x + y operation, truncating modulo p words in general and +// returning a top carry (0 or 1) in the p'th place, only adding the input +// words below p (as well as m and n respectively) to get the sum and carry. +// +// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX +// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add) + .text + +#define p %rdi +#define z %rsi +#define m %rdx +#define x %rcx +#define n %r8 +#define y %r9 +#define i %r10 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_add): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// First clamp the two input sizes m := min(p,m) and n := min(p,n) since +// we'll never need words past the p'th. Can now assume m <= p and n <= p. 
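Before the two asymmetric branches that follow, a minimal C model of the z := x + y semantics specified in the bignum_add header above may help. This is illustrative only: bignum_add_ref is a hypothetical name, and unlike the assembly in this file it is neither constant-time nor length-clamping in place.

#include <stdint.h>

// Reference model: add the low min(m,p) words of x and min(n,p) words of y into
// p words of z, returning the carry out of the p'th place.
static uint64_t bignum_add_ref(uint64_t p, uint64_t *z, uint64_t m, const uint64_t *x,
                               uint64_t n, const uint64_t *y) {
  uint64_t c = 0;
  for (uint64_t i = 0; i < p; i++) {
    uint64_t a = (i < m) ? x[i] : 0;
    uint64_t b = (i < n) ? y[i] : 0;
    uint64_t s = a + c;
    uint64_t c1 = (s < a);        // carry from a + c (c is 0 or 1)
    z[i] = s + b;
    c = c1 | (z[i] < s);          // total carry out of this word
  }
  return c;
}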
+// Then compare the modified m and n and branch accordingly + + cmpq m, p + cmovcq p, m + cmpq n, p + cmovcq p, n + cmpq n, m + jc bignum_add_ylonger + +// The case where x is longer or of the same size (p >= m >= n) + + subq m, p + subq n, m + incq m + testq n, n + jz bignum_add_xtest +bignum_add_xmainloop: + movq (x,i,8), a + adcq (y,i,8), a + movq a, (z,i,8) + incq i + decq n + jnz bignum_add_xmainloop + jmp bignum_add_xtest +bignum_add_xtoploop: + movq (x,i,8), a + adcq $0, a + movq a, (z,i,8) + incq i +bignum_add_xtest: + decq m + jnz bignum_add_xtoploop + movl $0, ashort + adcq $0, a + testq p, p + jnz bignum_add_tails +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where y is longer (p >= n > m) + +bignum_add_ylonger: + + subq n, p + subq m, n + testq m, m + jz bignum_add_ytoploop +bignum_add_ymainloop: + movq (x,i,8), a + adcq (y,i,8), a + movq a, (z,i,8) + incq i + decq m + jnz bignum_add_ymainloop +bignum_add_ytoploop: + movq (y,i,8), a + adcq $0, a + movq a, (z,i,8) + incq i + decq n + jnz bignum_add_ytoploop + movl $0, ashort + adcq $0, a + testq p, p + jnz bignum_add_tails +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// Adding a non-trivial tail, when p > max(m,n) + +bignum_add_tails: + movq a, (z,i,8) + xorq a, a + jmp bignum_add_tail +bignum_add_tailloop: + movq a, (z,i,8) +bignum_add_tail: + incq i + decq p + jnz bignum_add_tailloop +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontifier.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontifier.S new file mode 100644 index 00000000000..08c910bb633 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontifier.S @@ -0,0 +1,465 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "amontification" constant z :== 2^{128k} (congruent mod m) +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_amontifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// This is called "amontifier" because any other value x can now be mapped into +// the almost-Montgomery domain with an almost-Montgomery multiplication by z. 
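A hypothetical usage sketch of that remark, pairing bignum_amontifier with bignum_amontmul (also part of this import): one almost-Montgomery multiplication by the amontifier constant maps a value into the Montgomery domain, since (val * 2^{128K}) / 2^{64K} is congruent to val * 2^{64K} mod the modulus. K, to_amont, val, val_m, mod and tmp are illustrative names, not part of this import.

#include <stdint.h>

#define K 8   // any digit count

extern void bignum_amontifier(uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t);
extern void bignum_amontmul(uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);

static void to_amont(uint64_t val_m[K], uint64_t val[K], uint64_t mod[K]) {
  uint64_t z[K], tmp[K];
  bignum_amontifier(K, z, mod, tmp);        // z congruent to 2^{128K} (mod mod)
  bignum_amontmul(K, val_m, val, z, mod);   // val_m congruent to val * 2^{64K} (mod mod),
                                            // possibly not fully reduced ("almost")
}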
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = t +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontifier) + .text + +#define k %rdi +#define z %rsi + +// These two inputs get moved to different places since RCX and RDX are special + +#define m %r12 +#define t %r13 + +// Other variables +// Matters that c is RCX as CL=lo(c) is assumed in shifts + +#define i %rbx +#define j %rbp +#define a %rax +#define c %rcx +#define h %r11 +#define l %r10 +#define b %r9 +#define n %r8 + +// Some aliases for the values b and n + +#define q %r8 +#define r %r9 + +#define ashort %eax +#define ishort %ebx +#define jshort %ebp +#define qshort %r8d + + +S2N_BN_SYMBOL(bignum_amontifier): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save some additional registers for use, copy args out of RCX and RDX + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + + movq %rdx, m + movq %rcx, t + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_amontifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. + + xorq i, i +bignum_amontifier_copyinloop: + movq (m,i,8), c + movq c, (t,i,8) + incq i + cmpq k, i + jc bignum_amontifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "neg c" sets the zeroness predicate (~CF) for the entire inner loop + + movq k, i + decq i + jz bignum_amontifier_normalized +bignum_amontifier_normloop: + xorq j, j + movq k, h + negq c + movl $0, ashort +bignum_amontifier_shufloop: + movq a, c + movq (t,j,8), a + cmovcq a, c + movq c, (t,j,8) + incq j + decq h + jnz bignum_amontifier_shufloop + decq i + jnz bignum_amontifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. +// Note that we don't care about the result of bsr for zero inputs so +// the simple xor-ing with 63 is safe. + +bignum_amontifier_normalized: + + bsrq c, c + xorq $63, c + + xorq b, b + xorq i, i +bignum_amontifier_bitloop: + movq (t,i,8), a + movq a, j + shldq %cl, b, a + movq a, (t,i,8) + movq j, b + incq i + cmpq k, i + jc bignum_amontifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. 
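A plain-C restatement of the estimation loop just described may make the invariant easier to follow. This branching sketch assumes h has its top bit set (the normalized case above); the assembly below performs the same updates branch-free with masks. In the corner case h = 2^63 it reproduces the same q = 2^63 - 1, r = 2^63 pair that the fix-up discussed further down corrects.

#include <stdint.h>

// Maintain 2^i == q*h + r (with r < h except in the h = 2^63 corner case)
// as i runs from 64 up to 126.
static void estimate_qr(uint64_t h, uint64_t *qout, uint64_t *rout) {
  uint64_t q = 1, r = 0 - h;            // i = 64: 2^64 = 1*h + (2^64 - h), since h >= 2^63
  for (int i = 64; i < 126; i++) {
    q += q;
    if (r >= h - r) {                   // doubling r would reach or exceed h
      q += 1;
      r = r - (h - r);                  // r := 2*r - h without overflowing 64 bits
    } else {
      r = r + r;
    }
  }
  *qout = q;                            // q = 2^126 div h (up to the fix-up noted below)
  *rout = r;
}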
+ + movq -8(t,k,8), h + movl $1, qshort + movq h, r + negq r + movl $62, ishort +bignum_amontifier_estloop: + + addq q, q + movq h, a + subq r, a + cmpq a, r // CF <=> r < h - r <=> 2 * r < h + sbbq a, a + notq a // a = bitmask(2 * r >= h) + subq a, q + addq r, r + andq h, a + subq a, r + decq i + jnz bignum_amontifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + incq r + cmpq r, h + adcq $0, q + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + xorq c, c + xorq i, i +bignum_amontifier_mulloop: + movq (t,i,8), %rax + mulq q + addq c, %rax + adcq $0, %rdx + movq %rax, (z,i,8) + movq %rdx, c + incq i + cmpq k, i + jc bignum_amontifier_mulloop + +// Now c is the high word of the product, so subtract 2^62 +// and then turn it into a bitmask in q = h + + movq $0x4000000000000000, %rax + subq a, c + sbbq q, q + notq q + +// Now do [c] * n - d for our final answer + + xorq c, c + xorq i, i +bignum_amontifier_remloop: + movq (t,i,8), a + andq q, a + negq c + sbbq (z,i,8), a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_amontifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from initial 2^{p+62} == r (mod n). 
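For reference, a branching C model of a single one of those modular doublings, for a k-word z with 0 <= z < n. The assembly below folds the left shift and the subtraction of n into one pass and replaces the branch with a masked add-back; moddouble_ref is an illustrative name, not a function in this import.

#include <stdint.h>

static void moddouble_ref(uint64_t k, uint64_t *z, const uint64_t *n) {
  uint64_t topbit = 0, borrow = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t zi = z[i];
    uint64_t dbl = (zi << 1) | topbit;   // low 64 bits of 2*z at this position
    topbit = zi >> 63;                   // bit shifted into the next word
    uint64_t t = dbl - n[i];
    uint64_t b1 = (dbl < n[i]);
    z[i] = t - borrow;                   // tentative 2*z - n
    borrow = b1 | (t < borrow);
  }
  if (topbit < borrow) {                 // 2*z - n was negative, so add n back
    uint64_t carry = 0;
    for (uint64_t i = 0; i < k; i++) {
      uint64_t s = z[i] + n[i];
      uint64_t c1 = (s < z[i]);
      z[i] = s + carry;
      carry = c1 | (z[i] < s);           // final carry out is dropped, as intended
    }
  }
}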
+ + xorq c, c + xorq j, j + xorq b, b +bignum_amontifier_dubloop1: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_amontifier_dubloop1 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_amontifier_corrloop1: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_amontifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + xorq c, c + xorq j, j + xorq b, b +bignum_amontifier_dubloop2: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_amontifier_dubloop2 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_amontifier_corrloop2: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_amontifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + xorq h, h + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this k more times so we end up == 2^{128*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. + + movq k, i +bignum_amontifier_modloop: + xorq b, b + movq k, n + xorq j, j + xorq c, c +bignum_amontifier_cmaloop: + adcq b, c + sbbq l, l + movq (z,j,8), %rax + mulq h + subq l, %rdx + addq c, %rax + movq (t,j,8), b + movq %rax, (t,j,8) + movq %rdx, c + incq j + decq n + jnz bignum_amontifier_cmaloop + adcq c, b + movq b, h + + sbbq l, l + + xorq j, j + xorq c, c +bignum_amontifier_oaloop: + movq (t,j,8), a + movq (z,j,8), b + andq l, b + negq c + adcq b, a + sbbq c, c + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_amontifier_oaloop + subq c, h + + decq i + jnz bignum_amontifier_modloop + +// Now do one almost-Montgomery reduction w.r.t. the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits for +// the end result. 
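The word-level Montgomery reduction step referred to here (and reused by the amontmul, amontredc and amontsqr routines later in this import) can be summarized in C as follows. The interface (montredc_step, the separate hi word) is illustrative rather than anything in this import, and it assumes a compiler providing unsigned __int128; w is the negated inverse of m[0] described below.

#include <stdint.h>

// Given a (k+1)-word accumulator (a[0..k-1] plus top word hi), choose d so that
// a + d*m is divisible by 2^64, add d*m, and shift down one word in place.
// Returns the extra carry out of the k-word window.
static uint64_t montredc_step(uint64_t k, uint64_t *a, uint64_t hi,
                              const uint64_t *m, uint64_t w) {
  uint64_t d = a[0] * w;                       // w*m[0] == -1 (mod 2^64), so low word cancels
  unsigned __int128 t = (unsigned __int128)d * m[0] + a[0];
  uint64_t carry = (uint64_t)(t >> 64);        // low 64 bits of t are zero by construction
  for (uint64_t j = 1; j < k; j++) {
    t = (unsigned __int128)d * m[j] + a[j] + carry;
    a[j - 1] = (uint64_t)t;                    // write back shifted down one word
    carry = (uint64_t)(t >> 64);
  }
  a[k - 1] = hi + carry;                       // absorb the incoming top word
  return (a[k - 1] < carry);                   // extra carry beyond k words
}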
+ + movq (m), a + movq a, c + movq a, b + shlq $2, c + subq c, b + xorq $2, b + movq b, c + imulq a, c + movl $2, ashort + addq c, a + addq $1, c + imulq a, b + imulq c, c + movl $1, ashort + addq c, a + imulq a, b + imulq c, c + movl $1, ashort + addq c, a + imulq a, b + imulq c, c + movl $1, ashort + addq c, a + imulq a, b + + movq (t), c + imulq c, b + + movq (m), %rax + mulq b + addq c, %rax + movq %rdx, c + movl $1, jshort + movq k, n + decq n + jz bignum_amontifier_montend + +bignum_amontifier_montloop: + adcq (t,j,8), c + sbbq l, l + movq (m,j,8), %rax + mulq b + subq l, %rdx + addq c, %rax + movq %rax, -8(t,j,8) + movq %rdx, c + incq j + decq n + jnz bignum_amontifier_montloop +bignum_amontifier_montend: + adcq c, h + sbbq l, l + movq h, -8(t,k,8) + + xorq j, j + xorq c, c +bignum_amontifier_osloop: + movq (t,j,8), a + movq (m,j,8), b + andq l, b + negq c + sbbq b, a + sbbq c, c + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_amontifier_osloop + + bignum_amontifier_end: + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontmul.S new file mode 100644 index 00000000000..d40c8aecfc0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontmul.S @@ -0,0 +1,249 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery multiply, z :== (x * y / 2^{64k}) (congruent mod m) +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_amontmul +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Does z :== (x * y / 2^{64k}) mod m, meaning that the result, in the native +// size k, is congruent modulo m, but might not be fully reduced mod m. This +// is why it is called *almost* Montgomery multiplication. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontmul) + .text + +// We copy x into %r9 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define x %r9 +#define y %rcx +#define m %r8 + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r10 +#define e %r11 +#define n %r12 +#define i %r13 +#define c0 %r14 +#define c1 %r15 + +// This one variable we store on the stack as we are a register short. +// At least it's only used once per iteration of the outer loop (k times) +// and with a single read each time, after one initial write. The variable +// is the word-level negated modular inverse + +#define w (%rsp) + +// Some more intuitive names for temp regs in initial word-level negmodinv. 
+ +#define t1 %rbx +#define t2 %rdx + +#define ashort %eax +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_amontmul): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Save registers and allocate space on stack for non-register variable w + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8, %rsp + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_amontmul_end + +// Move x input into its permanent home, since we need %rdx for multiplications + + movq %rdx, x + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t2 + movq a, t1 + shlq $2, t2 + subq t2, t1 + xorq $2, t1 + + movq t1, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + movq t1, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + xorq i, i // Also initializes i for main loop + xorq j, j +bignum_amontmul_zoop: + movq i, (z,j,8) + incq j + cmpq k, j + jc bignum_amontmul_zoop + + xorq c0, c0 + +// Outer loop pulling down digits d=x[i], multiplying by y and reducing + +bignum_amontmul_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in. +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. + + movq (x,i,8), d + xorq j, j + xorq h, h + xorq c1, c1 + movq k, n + +bignum_amontmul_maddloop: + adcq (z,j,8), h + sbbq e, e + movq (y,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, (z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_amontmul_maddloop + adcq h, c0 + adcq c1, c1 + +// Montgomery reduction loop, similar but offsetting writebacks + + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, n + decq n + jz bignum_amontmul_montend + +bignum_amontmul_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_amontmul_montloop + +bignum_amontmul_montend: + adcq c0, h + adcq $0, c1 + movq c1, c0 + movq h, -8(z,j,8) + +// End of outer loop. + + incq i + cmpq k, i + jc bignum_amontmul_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask "d" +// and do a masked subtraction of m for the final almost-Montgomery result. 
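A short C sketch of that masked correction, assuming (as stated above) that the carry word is 0 or 1; masked_sub is an illustrative name.

#include <stdint.h>

// Subtract m from z exactly when c == 1, without branching on c.
static void masked_sub(uint64_t k, uint64_t *z, const uint64_t *m, uint64_t c) {
  uint64_t mask = 0 - c;                 // all ones when c == 1, all zeros when c == 0
  uint64_t borrow = 0;
  for (uint64_t j = 0; j < k; j++) {
    uint64_t mj = m[j] & mask;
    uint64_t t = z[j] - mj;
    uint64_t b1 = (z[j] < mj);
    z[j] = t - borrow;
    borrow = b1 | (t < borrow);
  }
}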
+ + xorq d, d + subq c0, d + xorq e, e + xorq j, j +bignum_amontmul_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_amontmul_corrloop + +bignum_amontmul_end: + + addq $8, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontredc.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontredc.S new file mode 100644 index 00000000000..c28d9f3d1f1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontredc.S @@ -0,0 +1,246 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery reduce, z :== (x' / 2^{64p}) (congruent mod m) +// Inputs x[n], m[k], p; output z[k] +// +// extern void bignum_amontredc +// (uint64_t k, uint64_t *z, +// uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); +// +// Does a :== (x' / 2^{64p}) mod m where x' = x if n <= p + k and in general +// is the lowest (p+k) digits of x. That is, p-fold almost-Montgomery reduction +// w.r.t. a k-digit modulus m giving a k-digit answer. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = m, R9 = p +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = m, [RSP+48] = p +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontredc) + .text + +// We copy x into %r10 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define n %r10 +#define x %rcx +#define m %r8 +#define p %r9 + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Negated modular inverse +#define w (%rsp) +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r11 +#define e %r12 +#define t %r13 +#define i %r14 +#define c %r15 + +// Some more intuitive names for temp regs in initial word-level negmodinv. + +#define t1 %rbx +#define t2 %r14 + +#define ashort %eax +#define cshort %r15d +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_amontredc): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// Save registers and allocate space on stack for non-register variable w + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8, %rsp + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_amontredc_end + +// Move n input into its permanent home, since we need %rdx for multiplications + + movq %rdx, n + +// Compute word-level negated modular inverse w for m == m[0]. 
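The negated inverse computed inline below can be summarized by the standard Newton (Hensel-lifting) iteration; this reference sketch is not a transcription of the exact instruction sequence used here, but for odd m it returns the same value, namely w with w * m + 1 == 0 (mod 2^64).

#include <stdint.h>

static uint64_t word_negmodinv_ref(uint64_t m) {
  uint64_t w = (m * 3) ^ 2;     // correct inverse of odd m modulo 2^5
  w *= 2 - m * w;               // each step doubles the number of correct bits:
  w *= 2 - m * w;               // 2^10, then 2^20, 2^40, and finally >= 2^64
  w *= 2 - m * w;
  w *= 2 - m * w;
  return 0 - w;                 // negate, so that (return value) * m == -1 (mod 2^64)
}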
+ + movq (m), a + + movq a, t2 + movq a, t1 + shlq $2, t2 + subq t2, t1 + xorq $2, t1 + + movq t1, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + movq t1, w + +// Initialize z to the lowest k digits of the input, zero-padding if n < k. + + movq k, j + cmpq k, n + cmovcq n, j + xorq i, i + testq j, j + jz bignum_amontredc_padloop +bignum_amontredc_copyloop: + movq (x,i,8), a + movq a, (z,i,8) + incq i + cmpq j, i + jc bignum_amontredc_copyloop + + cmpq k, i + jnc bignum_amontredc_initialized + + xorq j, j +bignum_amontredc_padloop: + movq j, (z,i,8) + incq i + cmpq k, i + jc bignum_amontredc_padloop + +bignum_amontredc_initialized: + xorq c, c + +// Now if p = 0 that's the end of the operation + + testq p, p + jz bignum_amontredc_end + +// Outer loop, just doing a standard Montgomery reduction on z + + xorq i, i +bignum_amontredc_outerloop: + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, t + decq t + jz bignum_amontredc_montend + +bignum_amontredc_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq t + jnz bignum_amontredc_montloop + +bignum_amontredc_montend: + adcq c, h + movl $0, cshort + adcq $0, c + + addq i, j + cmpq n, j + jnc bignum_amontredc_offtheend + movq (x,j,8), a + addq a, h + adcq $0, c +bignum_amontredc_offtheend: + movq h, -8(z,k,8) + +// End of outer loop. + + incq i + cmpq p, i + jc bignum_amontredc_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask "d" +// and do a masked subtraction of m for the final almost-Montgomery result. + + xorq d, d + subq c, d + xorq e, e + xorq j, j +bignum_amontredc_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_amontredc_corrloop + +bignum_amontredc_end: + addq $8, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontsqr.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontsqr.S new file mode 100644 index 00000000000..c6549f7f731 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_amontsqr.S @@ -0,0 +1,236 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Almost-Montgomery square, z :== (x^2 / 2^{64k}) (congruent mod m) +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_amontsqr +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z :== (x^2 / 2^{64k}) mod m, meaning that the result, in the native +// size k, is congruent modulo m, but might not be fully reduced mod m. This +// is why it is called *almost* Montgomery squaring. 
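A hypothetical illustration of that remark: squaring yields the same residue class as multiplying x by itself, so a caller could (less efficiently) substitute bignum_amontmul. Only congruence mod m is promised either way, since both results are merely "almost" reduced; amontsqr_via_mul is an illustrative name.

#include <stdint.h>

extern void bignum_amontmul(uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m);

static void amontsqr_via_mul(uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m) {
  bignum_amontmul(k, z, x, x, m);   // congruent mod m to bignum_amontsqr(k, z, x, m)
}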
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_amontsqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_amontsqr) + .text + +// We copy x into %r9 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define x %r9 +#define m %rcx + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Negated modular inverse +#define w %r8 +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r10 +#define e %r11 +#define n %r12 +#define i %r13 +#define c0 %r14 +#define c1 %r15 + +// A temp reg in the initial word-level negmodinv. + +#define t2 %rdx + +#define ashort %eax +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_amontsqr): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_amontsqr_end + +// Move x input into its permanent home, since we need %rdx for multiplications + + movq %rdx, x + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t2 + movq a, w + shlq $2, t2 + subq t2, w + xorq $2, w + + movq w, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + xorq i, i // Also initializes i for main loop + xorq j, j +bignum_amontsqr_zoop: + movq i, (z,j,8) + incq j + cmpq k, j + jc bignum_amontsqr_zoop + + xorq c0, c0 + +// Outer loop pulling down digits d=x[i], multiplying by x and reducing + +bignum_amontsqr_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in. +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. + + movq (x,i,8), d + xorq j, j + xorq h, h + xorq c1, c1 + movq k, n + +bignum_amontsqr_maddloop: + adcq (z,j,8), h + sbbq e, e + movq (x,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, (z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_amontsqr_maddloop + adcq h, c0 + adcq c1, c1 + +// Montgomery reduction loop, similar but offsetting writebacks + + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, n + decq n + jz bignum_amontsqr_montend + +bignum_amontsqr_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_amontsqr_montloop + +bignum_amontsqr_montend: + adcq c0, h + adcq $0, c1 + movq c1, c0 + movq h, -8(z,j,8) + +// End of outer loop. 
+ + incq i + cmpq k, i + jc bignum_amontsqr_outerloop + +// Now convert carry word, which is always in {0,1}, into a mask "d" +// and do a masked subtraction of m for the final almost-Montgomery result. + + xorq d, d + subq c0, d + xorq e, e + xorq j, j +bignum_amontsqr_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_amontsqr_corrloop + +bignum_amontsqr_end: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitfield.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitfield.S new file mode 100644 index 00000000000..6b8e366b511 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitfield.S @@ -0,0 +1,124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select bitfield starting at bit n with length l <= 64 +// Inputs x[k], n, l; output function return +// +// extern uint64_t bignum_bitfield +// (uint64_t k, uint64_t *x, uint64_t n, uint64_t l); +// +// One-word bitfield from a k-digit (digit=64 bits) bignum, in constant-time +// style. Bitfield starts at bit n and has length l, indexing from 0 (=LSB). +// Digits above the top are treated uniformly as zero, as usual. Since the +// result is returned in a single word, effectively we use l' = min(64,l) +// for the length. +// +// Standard x86-64 ABI: RDI = k, RSI = x, RDX = n, RCX = l, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, R8 = n, R9 = l, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bitfield) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bitfield) + .text + +#define k %rdi +#define x %rsi +#define n %rdx +#define l %rcx + +#define d %r8 +#define e %rax +#define i %r9 +#define a %r10 +#define m %r11 + +#define mshort %r11d + + + +S2N_BN_SYMBOL(bignum_bitfield): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Initialize second of digit pair to zero and if length is zero finish +// immediately; the digit e is also the return value in RAX + + xorq e, e + testq k, k + jz bignum_bitfield_end + +// Decompose the index into n = 64 * n + m, then increment n for next part + + movl $63, mshort + andq n, m + shrq $6, n + incq n + +// Run over the digits setting d = n'th and e = (n+1)'th + + xorq i, i +bignum_bitfield_loop: + movq (x,i,8), a + cmpq n, i + cmovcq a, d + cmovzq a, e + incq i + cmpq k, i + jc bignum_bitfield_loop + +// Put zero in a register, for several purposes + + xorq a, a + +// Override d with 0 if we ran off the end (e will retain original 0). + + cmpq n, i + cmovcq a, d + +// Override e if we have m = 0 (i.e. original n was divisible by 64) +// This is because then we want to shift it right by 64 below. 
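For comparison, a branching (non-constant-time) C model of the extraction specified in the bignum_bitfield header above; it must dodge the same two shift-by-64 hazards the comments mention, namely the combine step when n is a multiple of 64 and the mask when l >= 64. bignum_bitfield_ref is an illustrative name.

#include <stdint.h>

static uint64_t bignum_bitfield_ref(uint64_t k, const uint64_t *x, uint64_t n, uint64_t l) {
  uint64_t w = n / 64, sh = n % 64;
  uint64_t d = (w < k) ? x[w] : 0;           // digit containing bit n, zero off the end
  uint64_t e = (w + 1 < k) ? x[w + 1] : 0;   // next digit up, zero off the end
  uint64_t v = sh ? ((d >> sh) | (e << (64 - sh))) : d;
  uint64_t mask = (l >= 64) ? ~(uint64_t)0 : (((uint64_t)1 << l) - 1);
  return v & mask;
}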
+ + testq m, m + cmovzq a, e + +// Create a size-l bitmask first (while the shift is conveniently in CL) + + cmpq $64, l + adcq a, a + shlq %cl, a + decq a + +// Combine shifted digits to get the bitfield(n,64) + + movq m, l + shrq %cl, d + negq %rcx + shlq %cl, e + orq d, e + +// Now mask it down to get bitfield (n,l) + + andq a, e + +bignum_bitfield_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitsize.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitsize.S new file mode 100644 index 00000000000..2d7331a8626 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_bitsize.S @@ -0,0 +1,88 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return size of bignum in bits +// Input x[k]; output function return +// +// extern uint64_t bignum_bitsize (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 0 +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations. +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bitsize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bitsize) + .text + +#define k %rdi +#define x %rsi +#define i %rax +#define w %rdx +#define a %rcx +#define j %r8 + + + +S2N_BN_SYMBOL(bignum_bitsize): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize the index i and also prepare default return value of 0 (i = %rax) + + xorq i, i + +// If the bignum is zero-length, just return 0 + + testq k, k + jz bignum_bitsize_end + +// Use w = a[i-1] to store nonzero words in a bottom-up sweep +// Set the initial default to be as if we had a 11...11 word directly below + + movq $-1, w + xorq j, j +bignum_bitsize_loop: + movq (x,j,8), a + incq j + testq a, a + cmovnzq j, i + cmovnzq a, w + cmpq k, j + jnz bignum_bitsize_loop + +// Now w = a[i-1] is the highest nonzero word, or in the zero case the +// default of the "extra" 11...11 = a[0-1]. We now want 64* i - clz(w) = +// 64 * i - (63 - bsr(w)) = (64 * i - 63) + bsr(w). Note that this code +// does not rely on the behavior of the bsr instruction for zero inputs, +// which is undefined. + + shlq $6, i + subq $63, i + bsrq w, w + addq w, %rax + +bignum_bitsize_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv.S new file mode 100644 index 00000000000..49d17e97828 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv.S @@ -0,0 +1,336 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide by a single (nonzero) word, z := x / m and return x mod m +// Inputs x[n], m; outputs function return (remainder) and z[k] +// +// extern uint64_t bignum_cdiv +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); +// +// Does the "z := x / m" operation where x is n digits, result z is k. +// Truncates the quotient in general, but always (for nonzero m) returns +// the true remainder x mod m. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cdiv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cdiv) + .text + +#define k %rdi +#define z %rsi +#define m %r8 + +// These parameters get moved because of special uses for %rcx, %rdx + +#define n %r9 +#define x %r10 + +// This needs to be in %rcx for variable shifts with %cl + +#define e %rcx + +// Other variables + +#define w %r11 +#define d %r12 +#define i %rbx +#define c %r13 +#define l %r14 + +#define a %rax +#define h %rdx + +#define ashort %eax +#define ishort %ebx +#define hshort %edx + +// The remainder + +#define r %r15 + +S2N_BN_SYMBOL(bignum_cdiv): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Move parameters that need a new home + + movq %rdx, n + movq %rcx, x + +// First do a modulus computation, slightly tweaked from bignum_cmod, +// changing variables and avoiding modification of the size parameter. +// Initialize l = 0 now for convenience (we eventually need to do it). 
+// If the bignum is zero-length, l is already the right answer of 0 + + xorq l, l + testq n, n + jz bignum_cdiv_nomodulus + + bsrq m, e + xorq $63, e + shlq %cl, m + + movq m, r + movq $0x1FFFFFFFFFFFF, w + shrq $16, r + xorq r, w + incq r + shrq $32, w + movq r, h + imulq w, h + negq h + movq h, a + shrq $49, a + imulq a, a + shrq $34, h + addq a, h + orq $0x40000000, a + imulq h, a + shrq $30, a + imulq w, a + shlq $30, w + addq a, w + shrq $30, w + movq r, h + imulq w, h + negq h + shrq $24, h + imulq w, h + shlq $16, w + shrq $24, h + addq h, w + movq r, h + imulq w, h + negq h + shrq $32, h + imulq w, h + shlq $31, w + shrq $17, h + addq h, w + movq m, a + mulq w + shrdq $60, h, a + movq w, h + shrq $33, h + notq a + imulq h, a + shlq $1, w + shrq $33, a + addq a, w + addq $1, w + movq m, a + sbbq $0, w + mulq w + addq m, h + sbbq $0, w + + movq m, r + imulq w, r + negq r + + xorl hshort, hshort + movq n, i +bignum_cdiv_modloop: + movq h, a + mulq r + addq -8(x,i,8), a + adcq l, h + movq a, l + sbbq a, a + andq r, a + addq a, l + adcq $0, h + decq i + jnz bignum_cdiv_modloop + + movq h, i + movq w, a + mulq h + addq i, h + sbbq r, r + andq m, r + + movq h, a + mulq m + addq r, h + xorq r, r + subq a, l + sbbq h, i + + cmovnzq m, r + xorl ashort, ashort + subq r, l + sbbq a, i + + cmovnzq m, a + subq a, l + + movq w, a + mulq l + addq l, h + rcr $1, h + + shrq %cl, m + xorq $63, e + shrq %cl, h + + imulq m, h + subq h, l + + movq l, r + subq m, l +bignum_cdiv_nomodulus: + cmovncq l, r + +// If k = 0 then there's no more to be done + + testq k, k + jz bignum_cdiv_end + +// Let e be the number of trailing zeros in m (we can ignore m = 0) + + bsfq m, e + +// Now just shift m right by e bits. So hereafter we can assume m is odd +// but we first need to shift the input right by e bits then divide by m. + + shrq %cl, m + +// Compute the negated modular inverse w with w * m + 1 == 0 (mod 2^64) +// This is essentially the same as word_negmodinv. + + movq m, a + movq m, w + shlq $2, a + subq a, w + xorq $2, w + movq w, a + imulq m, a + movl $2, hshort + addq a, h + addq $1, a + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + +// We have the remainder r, so now x = m * y + r for some quotient y +// to be computed. Consider x' = x + (m - r) = m * (y + 1) and do a +// Montgomery reduction, keeping the cofactor z. This gives us +// x' + m * z = 2^{64k} * c where c <= m. Thus since x' = m * (y + 1) +// we have +// +// m * (y + z + 1) = 2^{64k} * c +// +// This means m * (y + z + 1) == 0 (mod 2^{64k}), even when we truncate +// x to k digits (if in fact k < n). Since m is odd, it's coprime to +// 2^{64k} so we can cancel and get y + z + 1 == 0 (mod 2^{64k}), and +// hence using logical complement y == ~z (mod 2^{64k}). Thus we can +// write back the logical complements of the cofactor as the answer. +// Start with carry word c = m - r/2^e to make the initial tweak +// x' = x + (m - r); since we've shifted everything initially by e +// we need to shift the remainder too before subtracting from the +// shifted m. + + movq r, d + shrq %cl, d + movq m, c + subq d, c + xorl ishort, ishort + +// Unless n = 0, preload the zeroth digit and bump up the x pointer by +// 8 and n down by 1, to ease indexing and comparison using the same +// variable i in the main loop. When n = 0 we leave it alone, as the +// comparison i < n will always fail and the x pointer is unused. 
+ + xorq d, d + testq n, n + jz bignum_cdiv_loop + movq (x), d + addq $8, x + decq n + +bignum_cdiv_loop: + +// Load the next digit up to get [l,d] then shift right e places + + xorq l, l + cmpq n, i + jnc bignum_cdiv_noload + movq (x,i,8), l +bignum_cdiv_noload: + shrdq %cl, l, d + addq c, d + sbbq c, c + negq c + +// Now the effective sum is [c,a] where the carry-in has been absorbed. +// Do the main Montgomery step with the (odd) m, writing back ~q. Finally +// set d to the next digit ready for the following iteration. + + movq w, a + imulq d, a + notq a + movq a, (z,i,8) + notq a + + mulq m + addq d, a + adcq h, c + + movq l, d + + incq i + cmpq k, i + jc bignum_cdiv_loop + +// Return the modulus + +bignum_cdiv_end: + movq r, %rax + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv_exact.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv_exact.S new file mode 100644 index 00000000000..98cfa63b70f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cdiv_exact.S @@ -0,0 +1,193 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide by a single word, z := x / m *when known to be exact* +// Inputs x[n], m; output z[k] +// +// extern void bignum_cdiv_exact +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t m); +// +// Does the "z := x / m" operation where x is n digits and result z is k, +// *assuming* that m is nonzero and that the input x is in fact an +// exact multiple of m. (If this isn't known, use the general bignum_cdiv +// function instead.) In general the result is truncated to k digits. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cdiv_exact) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cdiv_exact) + .text + +#define k %rdi +#define z %rsi +#define m %r8 + +// These parameters get moved because of special uses for %rcx, %rdx + +#define n %r9 +#define x %r10 + +// This needs to be in %rcx for variable shifts with %cl + +#define e %rcx + +// Other variables + +#define w %r11 +#define d %r12 +#define i %rbx +#define c %r13 +#define t %r14 + +#define a %rax +#define h %rdx + +#define ishort %ebx +#define hshort %edx + +S2N_BN_SYMBOL(bignum_cdiv_exact): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + +// If k = 0 then there's nothing to be done + + testq k, k + jz bignum_cdiv_exact_end + +// Move parameters that need a new home + + movq %rdx, n + movq %rcx, x + +// Let e be the number of trailing zeros in m (we can ignore m = 0) + + bsfq m, e + +// Now just shift m right by e bits. So hereafter we can assume m is odd +// but we first need to shift the input right by e bits then divide by m. 
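As a one-word illustration of the strategy just described (strip the trailing-zero count e from both the divisor and the dividend, then multiply by the inverse of the remaining odd factor modulo 2^64), here is a hedged C sketch; cdiv_exact_1 and negmodinv64 are hypothetical names (the latter repeats the helper from the earlier sketch so the example stays self-contained), and __builtin_ctzll assumes a GCC/Clang-style compiler.

#include <assert.h>
#include <stdint.h>

/* Same hypothetical helper as before: negated inverse of odd m mod 2^64. */
static uint64_t negmodinv64(uint64_t m) {
  uint64_t w = (m - (m << 2)) ^ 2;
  uint64_t e = w * m + 1;
  w *= 1 + e; e *= e;
  w *= 1 + e; e *= e;
  w *= 1 + e; e *= e;
  return w * (1 + e);
}

/* Exact division x / m for one word, assuming m != 0 and m divides x. */
static uint64_t cdiv_exact_1(uint64_t x, uint64_t m) {
  unsigned e = (unsigned)__builtin_ctzll(m);   /* trailing zeros of m */
  uint64_t modd = m >> e;                      /* odd part of m */
  return (x >> e) * (0 - negmodinv64(modd));   /* multiply by modd^{-1} */
}

int main(void) {
  uint64_t m = 48, y = 123456789;
  assert(cdiv_exact_1(m * y, m) == y);
  return 0;
}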
+ + shrq %cl, m + +// Compute the negated modular inverse w with w * m + 1 == 0 (mod 2^64) +// This is essentially the same as word_negmodinv. + + movq m, a + movq m, w + shlq $2, a + subq a, w + xorq $2, w + movq w, a + imulq m, a + movl $2, hshort + addq a, h + addq $1, a + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + imulq a, a + movl $1, hshort + addq a, h + imulq h, w + +// Consider x' = x + m and do a Montgomery reduction, keeping the cofactor z. +// This gives us x' + m * z = 2^{64k} * c where c <= m. Assuming x = m * y +// we then have m * y + m + m * z = 2^{64k} * c, i.e. +// +// m * (y + z + 1) = 2^{64k} * c +// +// This means m * (y + z + 1) == 0 (mod 2^{64k}), even when we truncate +// x to k digits (if in fact k < n). Since m is odd, it's coprime to +// 2^{64k} so we can cancel and get y + z + 1 == 0 (mod 2^{64k}), and +// hence using logical complement y == ~z (mod 2^{64k}). Thus we can +// write back the logical complements of the cofactor as the answer. +// Start with carry word c = m to make the initial tweak x' = x + m. + + movq m, c + xorl ishort, ishort + +// Unless n = 0, preload the zeroth digit and bump up the x pointer by +// 8 and n down by 1, to ease indexing and comparison using the same +// variable i in the main loop. When n = 0 we leave it alone, as the +// comparison i < n will always fail and the x pointer is unused. + + xorq d, d + testq n, n + jz bignum_cdiv_exact_loop + movq (x), d + addq $8, x + decq n + +bignum_cdiv_exact_loop: + +// Load the next digit up to get [t,d] then shift right e places + + xorq t, t + cmpq n, i + jnc bignum_cdiv_exact_noload + movq (x,i,8), t +bignum_cdiv_exact_noload: + shrdq %cl, t, d + addq c, d + sbbq c, c + negq c + +// Now the effective sum is [c,a] where the carry-in has been absorbed. +// Do the main Montgomery step with the (odd) m, writing back ~q. Finally +// set d to the next digit ready for the following iteration. + + movq w, a + imulq d, a + notq a + movq a, (z,i,8) + notq a + + mulq m + addq d, a + adcq h, c + + movq t, d + + incq i + cmpq k, i + jc bignum_cdiv_exact_loop + +bignum_cdiv_exact_end: + popq %r14 + popq %r13 + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cld.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cld.S new file mode 100644 index 00000000000..a3581a6b7a5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cld.S @@ -0,0 +1,73 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero digits (64-bit words) +// Input x[k]; output function return +// +// extern uint64_t bignum_cld (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is k +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cld) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cld) + .text + +#define k %rdi +#define x %rsi +#define i %rax +#define a %rcx +#define j %rdx + + + +S2N_BN_SYMBOL(bignum_cld): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize the index i and also prepare default return value of 0 (i = %rax) + + xorq i, i + +// If the bignum is zero-length, just return k = 0 + + testq k, k + jz bignum_cld_end + +// Run over the words j = 0..i-1, and set i := j + 1 when hitting nonzero a[j] + + xorq j, j +bignum_cld_loop: + movq (x,j,8), a + incq j + testq a, a + cmovnzq j, i + cmpq k, j + jnz bignum_cld_loop + + negq %rax + addq %rdi, %rax + +bignum_cld_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_clz.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_clz.S new file mode 100644 index 00000000000..f4014f7e35e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_clz.S @@ -0,0 +1,88 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero bits +// Input x[k]; output function return +// +// extern uint64_t bignum_clz (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 64 * k +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_clz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_clz) + .text + +#define k %rdi +#define x %rsi +#define i %rax +#define w %rdx +#define a %rcx +#define j %r8 + + + +S2N_BN_SYMBOL(bignum_clz): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize the index i and also prepare default return value of 0 (i = %rax) + + xorq i, i + +// If the bignum is zero-length, just return 0 + + testq k, k + jz bignum_clz_end + +// Use w = a[i-1] to store nonzero words in a bottom-up sweep +// Set the initial default to be as if we had a 11...11 word directly below + + movq $-1, w + xorq j, j +bignum_clz_loop: + movq (x,j,8), a + incq j + testq a, a + cmovnzq j, i + cmovnzq a, w + cmpq k, j + jnz bignum_clz_loop + +// Now w = a[i-1] is the highest nonzero word, or in the zero case the +// default of the "extra" 11...11 = a[0-1]. 
We now want 64*(k - i) + clz(w) = +// 64*(k - i) + (63 - bsr(w)). Note that this code does not rely on the +// behavior of the bsr instruction for zero inputs, where it is undefined + + subq i, k + shlq $6, k + bsrq w, %rax + xorq $63, %rax + addq k, %rax + +bignum_clz_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmadd.S new file mode 100644 index 00000000000..d423ebb00c8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmadd.S @@ -0,0 +1,144 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply-add with single-word multiplier, z := z + c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmadd +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := z + c * y" operation where y is n digits, result z is p. +// Truncates the result in general. +// +// The return value is a high/carry word that is meaningful when p = n + 1, or +// more generally when n <= p and the result fits in p + 1 digits. In these +// cases it gives the top digit of the (p + 1)-digit result. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmadd) + .text + +#define p %rdi +#define z %rsi +#define c %r9 +#define n %rcx +#define x %r8 + +#define i %r10 +#define h %r11 + +#define r %rbx + +#define hshort %r11d +#define ishort %r10d + + + +S2N_BN_SYMBOL(bignum_cmadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Seems hard to avoid one more register + + pushq %rbx + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. 
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmpq n, p + cmovcq p, n + subq n, p + +// Initialize high part h = 0; if n = 0 do nothing but return that zero + + xorq h, h + testq n, n + jz bignum_cmadd_end + +// Move c into a safer register as multiplies overwrite %rdx + + movq %rdx, c + +// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * x_0 + + movq (x), %rax + mulq c + addq %rax, (z) + movq %rdx, h + movl $1, ishort + decq n + jz bignum_cmadd_hightail + +// Main loop, where we always have CF + previous high part h to add in + +bignum_cmadd_loop: + adcq (z,i,8), h + sbbq r, r + movq (x,i,8), %rax + mulq c + subq r, %rdx + addq h, %rax + movq %rax, (z,i,8) + movq %rdx, h + incq i + decq n + jnz bignum_cmadd_loop + +bignum_cmadd_hightail: + adcq $0, h + +// Propagate the carry all the way to the end with h as extra carry word + +bignum_cmadd_tail: + testq p, p + jz bignum_cmadd_end + + addq h, (z,i,8) + movl $0, hshort + incq i + decq p + jz bignum_cmadd_highend + +bignum_cmadd_tloop: + adcq h, (z,i,8) + incq i + decq p + jnz bignum_cmadd_tloop + +bignum_cmadd_highend: + + adcq $0, h + +// Return the high/carry word + +bignum_cmadd_end: + movq h, %rax + + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmnegadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmnegadd.S new file mode 100644 index 00000000000..cc9e80ccfd0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmnegadd.S @@ -0,0 +1,154 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negated multiply-add with single-word multiplier, z := z - c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmnegadd +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := z - c * y" operation where y is n digits, result z is p. +// Truncates the result in general. +// +// The return value is a high/carry word that is meaningful when n <= p. +// It is interpreted negatively as z' - 2^{64k} * return = z - c * y. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmnegadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmnegadd) + .text + +#define p %rdi +#define z %rsi +#define c %r9 +#define n %rcx +#define x %r8 + +#define i %r10 +#define h %r11 + +#define r %rbx + +#define hshort %r11d +#define ishort %r10d + + + +S2N_BN_SYMBOL(bignum_cmnegadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Seems hard to avoid one more register + + pushq %rbx + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. 
+// Subtract p := p - min(n,p) so it holds the size of the extra tail needed + + cmpq n, p + cmovcq p, n + subq n, p + +// Initialize high part h = 0; if n = 0 do nothing but return that zero + + xorq h, h + testq n, n + jz bignum_cmnegadd_end + +// Move c into a safer register as multiplies overwrite %rdx + + movq %rdx, c + +// Initialization of the loop: 2^64 * CF + [h,z_0'] = z_0 + c * ~x_0 + c + + movq (x), %rax + notq %rax + mulq c + addq c, %rax + adcq $0, %rdx + addq %rax, (z) + movq %rdx, h + movl $1, ishort + decq n + jz bignum_cmnegadd_tail + +// Main loop, where we always have CF + previous high part h to add in + +bignum_cmnegadd_loop: + adcq (z,i,8), h + sbbq r, r + movq (x,i,8), %rax + notq %rax + mulq c + subq r, %rdx + addq h, %rax + movq %rax, (z,i,8) + movq %rdx, h + incq i + decq n + jnz bignum_cmnegadd_loop + +// At this point we have 2^{64n} * (h + CF) + z' = z + c * (2^{64n} - x) +// so z' - 2^{64n} * (c - (h + CF)) = z - c * x. +// Since z - c * x < 2^{64n} we must have c - (h + CF) >= 0. +// Accumulate the negative carry in h for consistency with trivial cases. + +bignum_cmnegadd_tail: + sbbq h, c + movq c, h + +// Propagate the carry all the way to the end with h as extra carry word + + testq p, p + jz bignum_cmnegadd_end + + subq h, (z,i,8) + movl $0, hshort + incq i + decq p + jz bignum_cmnegadd_highend + +bignum_cmnegadd_tloop: + sbbq h, (z,i,8) + incq i + decq p + jnz bignum_cmnegadd_tloop + +bignum_cmnegadd_highend: + +// Adjust the high word with the carry from subtraction + + adcq $0, h + +// Return the high/carry word + +bignum_cmnegadd_end: + movq h, %rax + + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmod.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmod.S new file mode 100644 index 00000000000..91aa3f4d828 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmod.S @@ -0,0 +1,223 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Find bignum modulo a single word +// Input x[k], m; output function return +// +// extern uint64_t bignum_cmod (uint64_t k, uint64_t *x, uint64_t m); +// +// Returns x mod m, assuming m is nonzero. +// +// Standard x86-64 ABI: RDI = k, RSI = x, RDX = m, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, R8 = m, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmod) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmod) + .text + +#define k %rdi +#define x %rsi + +// This has to be %rcx for variable shifts + +#define e %rcx + +// We share the same variable for m and n, just shifting left then right. +// And h is kept in %rdx which does work despite the special operands of mul. + +#define m %r8 +#define n %r8 + +#define w %r9 +#define a %rax +#define r %r10 +#define h %rdx +#define l %r11 + +#define ashort %eax +#define hshort %edx + +S2N_BN_SYMBOL(bignum_cmod): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Initialize l = 0 now for convenience (we eventually need to do it). 
+// If the bignum is zero-length, l is already the right answer of 0 + + xorq l, l + testq k, k + jz bignum_cmod_end + +// Move m into its permanent home (also used for n). +// Find number of leading zeros of m and let n = 2^e m so that for an +// in-scope (nonzero) input m we have n >= 2^63, e <= 63. + + movq %rdx, m + bsrq m, e + xorq $63, e + shlq %cl, m + +// A near-clone of word_recip so 2^64 + w = ceil(2^128 / n) - 1 + + movq n, r + movq $0x1FFFFFFFFFFFF, w + shrq $16, r + xorq r, w + incq r + shrq $32, w + movq r, h + imulq w, h + negq h + movq h, a + shrq $49, a + imulq a, a + shrq $34, h + addq a, h + orq $0x40000000, a + imulq h, a + shrq $30, a + imulq w, a + shlq $30, w + addq a, w + shrq $30, w + movq r, h + imulq w, h + negq h + shrq $24, h + imulq w, h + shlq $16, w + shrq $24, h + addq h, w + movq r, h + imulq w, h + negq h + shrq $32, h + imulq w, h + shlq $31, w + shrq $17, h + addq h, w + movq n, a + mulq w + shrdq $60, h, a + movq w, h + shrq $33, h + notq a + imulq h, a + shlq $1, w + shrq $33, a + addq a, w + addq $1, w + movq n, a + sbbq $0, w + mulq w + addq n, h + sbbq $0, w + +// Take the residue r = 2^128 - (2^64 + w) * n, which by the above bound +// we know fits in 64 bits. We know 2^128 == r (mod n) and hence (mod m). + + movq n, r + imulq w, r + negq r + +// Now just go down through the digits accumulating [h;l] == x (mod n) +// by 2^64 * [h;l] + d = 2^128 * h + [l;d] == r * h + [l; d]. That addition +// may overflow with a carry, say 2^128 + [h';l'] = r * h + [l; d], in +// which case we subtract 2^128 - r (which is divisible by m and keeping +// things in 128 bits we just add r). Thus the overall bound when we initially +// overflow is r * h + [l; d] - (2^128 - r) = r * (h + 1) + [l; d] - 2^128 +// < 2^128 so we stay inside 2 words + + xorl hshort, hshort +bignum_cmod_loop: + movq h, a + mulq r + addq -8(x,k,8), a + adcq l, h + movq a, l + sbbq a, a + andq r, a + addq a, l + adcq $0, h + decq k + jnz bignum_cmod_loop + +// Now do reciprocal multiplication to reduce the 2-word modular equivalent +// [h;l] to the single word l. If we assume the truncations are as follows +// 2^64 + w = 2^128 / n - epsilon (0 <= epsilon <= 1) +// q = (w * h / 2^64) - delta (0 <= delta <= 1) +// the net remainder is l + (h/2^64 * epsilon + delta) * n < l + 2 * n. +// In general this needs two rounds of comparison to guarantee getting +// into a single word (though one more mul could be used instead). +// Also, the quotient estimate can overflow so we use r as extra addend +// 2^64 * n when the initial addition overflows. The overall multiple +// of n can't itself overflow, since we know it's an underestimate of +// the initial residue. + + movq h, k // back up h for muls + movq w, a + mulq h + addq k, h + sbbq r, r + andq n, r // So q = (r;h) + + movq h, a + mulq n + addq r, h + xorq r, r + subq a, l + sbbq h, k // (k,l) = first reduction + + cmovnzq n, r + xorl ashort, ashort + subq r, l + sbbq a, k + + cmovnzq n, a + subq a, l + +// One more reciprocal multiplication to do a modular reduction, but now in +// one word and in terms of the original m. For the quotient estimate we want +// q = ((2^64 + w) * l) / 2^{128-e} = ((2^64 + w) * l) / 2^65 / 2^{63-e}. + + movq w, a + mulq l + addq l, h + rcr $1, h + + shrq %cl, m + xorq $63, e + shrq %cl, h + + imulq m, h + subq h, l + +// Note that since there is no neglected "low" part of the single word, +// one round of correction suffices; in the analog of the above l = 0 +// and hence the residue so far is already < 2 * m. 
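For validating results of this routine, a straightforward functional reference can be written in C. The sketch below is illustrative only: cmod_ref is a hypothetical name, it is not constant time, and it relies on compiler-provided unsigned __int128 division (GCC/Clang) rather than the reciprocal scheme above.

#include <stdint.h>

/* x mod m for a k-digit little-endian x and nonzero m, accumulating top-down
   with the same invariant as the loop above: r := (2^64 * r + d) mod m. */
static uint64_t cmod_ref(uint64_t k, const uint64_t *x, uint64_t m) {
  unsigned __int128 r = 0;
  while (k--) {
    r = ((r << 64) | x[k]) % m;
  }
  return (uint64_t)r;
}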
+ + movq l, a + subq m, l +bignum_cmod_end: + cmovncq l, a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmul.S new file mode 100644 index 00000000000..3a936011e17 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_cmul.S @@ -0,0 +1,127 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word, z := c * y +// Inputs c, y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_cmul +// (uint64_t k, uint64_t *z, uint64_t c, uint64_t n, uint64_t *y); +// +// Does the "z := c * y" operation where y is n digits, result z is p. +// Truncates the result in general unless p >= n + 1. +// +// The return value is a high/carry word that is meaningful when p >= n as +// giving the high part of the result. Since this is always zero if p > n, +// it is mainly of interest in the special case p = n, i.e. where the source +// and destination have the same nominal size, when it gives the extra word +// of the full result. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = c, RCX = n, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = c, R9 = n, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul) + .text + +#define p %rdi +#define z %rsi +#define c %r9 +#define n %rcx +#define x %r8 + +#define i %r10 +#define h %r11 + + + +S2N_BN_SYMBOL(bignum_cmul): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. Now we can +// assume that n <= p + + cmpq n, p + cmovcq p, n + +// Initialize current input/output pointer offset i and high part h. 
+// But then if n = 0 skip the multiplication and go to the tail part + + xorq h, h + xorq i, i + testq n, n + jz bignum_cmul_tail + +// Move c into a safer register as multiplies overwrite %rdx + + movq %rdx, c + +// Initialization of the loop: [h,l] = c * x_0 + + movq (x), %rax + mulq c + movq %rax, (z) + movq %rdx, h + incq i + cmpq n, i + jz bignum_cmul_tail + +// Main loop doing the multiplications + +bignum_cmul_loop: + movq (x,i,8), %rax + mulq c + addq h, %rax + adcq $0, %rdx + movq %rax, (z,i,8) + movq %rdx, h + incq i + cmpq n, i + jc bignum_cmul_loop + +// Add a tail when the destination is longer + +bignum_cmul_tail: + cmpq p, i + jnc bignum_cmul_end + movq h, (z,i,8) + xorq h, h + incq i + cmpq p, i + jnc bignum_cmul_end + +bignum_cmul_tloop: + movq h, (z,i,8) + incq i + cmpq p, i + jc bignum_cmul_tloop + +// Return the high/carry word + +bignum_cmul_end: + movq h, %rax + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_coprime.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_coprime.S new file mode 100644 index 00000000000..442b84b8c15 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_coprime.S @@ -0,0 +1,518 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignums for coprimality, gcd(x,y) = 1 +// Inputs x[m], y[n]; output function return; temporary buffer t[>=2*max(m,n)] +// +// extern uint64_t bignum_coprime +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y, uint64_t *t); +// +// Test for whether two bignums are coprime (no common factor besides 1). +// This is equivalent to testing if their gcd is 1, but a bit faster than +// doing those two computations separately. +// +// Here bignum x is m digits long, y is n digits long and the temporary +// buffer t needs to be 2 * max(m,n) digits long. The return value is +// 1 if coprime(x,y) and 0 otherwise. +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, R8 = t, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, [RSP+40] = t, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_coprime) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_coprime) + .text + +// We get CHUNKSIZE bits per outer iteration, 64 minus a bit for proxy errors + +#define CHUNKSIZE 58 + +// These variables are so fundamental we keep them consistently in registers. 
+// m is in fact the temporary buffer argument w so use the same register + +#define m %r8 +#define n %r15 +#define k %r14 +#define l %r13 + +// These are kept on the stack since there aren't enough registers + +#define mat_mm (%rsp) +#define mat_mn 8(%rsp) +#define mat_nm 16(%rsp) +#define mat_nn 24(%rsp) +#define t 32(%rsp) +#define evenor 40(%rsp) + +#define STACKVARSIZE 48 + +// These are shorthands for common temporary register + +#define a %rax +#define b %rbx +#define c %rcx +#define d %rdx +#define i %r9 + +// Temporaries for the top proxy selection part + +#define c1 %r10 +#define c2 %r11 +#define h1 %r12 +#define h2 %rbp +#define l1 %rdi +#define l2 %rsi + +// Re-use for the actual proxies; m_hi = h1 and n_hi = h2 are assumed + +#define m_hi %r12 +#define n_hi %rbp +#define m_lo %rdi +#define n_lo %rsi + +// Re-use for the matrix entries in the inner loop, though they +// get spilled to the corresponding memory locations mat_... + +#define m_m %r10 +#define m_n %r11 +#define n_m %rcx +#define n_n %rdx + +#define ishort %r9d +#define m_mshort %r10d +#define m_nshort %r11d +#define n_mshort %ecx +#define n_nshort %edx + +// Because they are so unmemorable + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx +#define arg4 %rcx + +S2N_BN_SYMBOL(bignum_coprime): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Save all required registers and make room on stack for all the above vars + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $STACKVARSIZE, %rsp + +// Compute k = max(m,n), and if this is zero skip to the end. Note that +// in this case k is also in %rax so serves as the right answer of "false" + + movq arg1, %rax + cmpq arg3, %rax + cmovcq arg3, %rax + movq %rax, k + + testq %rax, %rax + jz bignum_coprime_end + +// Set up inside w two size-k buffers m and n + + leaq (m,k,8), n + +// Copy the input x into the buffer m, padding with zeros as needed + + xorq i, i + testq arg1, arg1 + jz bignum_coprime_xpadloop +bignum_coprime_xloop: + movq (arg2,i,8), a + movq a, (m,i,8) + incq i + cmpq arg1, i + jc bignum_coprime_xloop + cmpq k, i + jnc bignum_coprime_xskip +bignum_coprime_xpadloop: + movq $0, (m,i,8) + incq i + cmpq k, i + jc bignum_coprime_xpadloop +bignum_coprime_xskip: + +// Copy the input y into the buffer n, padding with zeros as needed + + xorq i, i + testq arg3, arg3 + jz bignum_coprime_ypadloop +bignum_coprime_yloop: + movq (arg4,i,8), a + movq a, (n,i,8) + incq i + cmpq arg3, i + jc bignum_coprime_yloop + cmpq k, i + jnc bignum_coprime_yskip +bignum_coprime_ypadloop: + movq $0, (n,i,8) + incq i + cmpq k, i + jc bignum_coprime_ypadloop +bignum_coprime_yskip: + +// Set up the outer loop count of 64 * sum of input sizes. +// The invariant is that m * n < 2^t at all times. + + leaq (arg1,arg3), a + shlq $6, a + movq a, t + +// Record for the very end the OR of the lowest words. +// If the bottom bit is zero we know both are even so the answer is false. +// But since this is constant-time code we still execute all the main part. + + movq (m), a + movq (n), b + orq b, a + movq a, evenor + +// Now if n is even trigger a swap of m and n. This ensures that if +// one or other of m and n is odd then we make sure now that n is, +// as expected by our invariant later on. 
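The loop below uses the standard masked-XOR conditional swap, so the memory access pattern and the work done are identical whether or not the swap happens. A small illustrative C rendering of the same idiom (condswap is a hypothetical name) is:

#include <stdint.h>

/* Swap the k-digit bignums m and n when swap_mask is all-ones; leave them
   untouched when it is all-zeros. Same pattern either way, so branch-free. */
static void condswap(uint64_t k, uint64_t *m, uint64_t *n, uint64_t swap_mask) {
  for (uint64_t i = 0; i < k; i++) {
    uint64_t d = (m[i] ^ n[i]) & swap_mask;   /* zero when not swapping */
    m[i] ^= d;
    n[i] ^= d;
  }
}

Matching the code here, the mask would be (n[0] & 1) - 1, which is all-ones exactly when n is even, i.e. when the swap should fire.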
+ + andq $1, b + subq $1, b + + xorq i, i +bignum_coprime_swaploop: + movq (m,i,8), a + movq (n,i,8), c + movq a, d + xorq c, d + andq b, d + xorq d, a + xorq d, c + movq a, (m,i,8) + movq c, (n,i,8) + incq i + cmpq k, i + jnz bignum_coprime_swaploop + +// Start of the main outer loop iterated t / CHUNKSIZE times + +bignum_coprime_outerloop: + +// We need only bother with sharper l = min k (ceil(t/64)) digits +// Either both m and n fit in l digits, or m has become zero and so +// nothing happens in the loop anyway and this makes no difference. + + movq t, l + addq $63, l + shrq $6, l + cmpq k, l + cmovncq k, l + +// Select upper and lower proxies for both m and n to drive the inner +// loop. The lower proxies are simply the lowest digits themselves, +// m_lo = m[0] and n_lo = n[0], while the upper proxies are bitfields +// of the two inputs selected so their top bit (63) aligns with the +// most significant bit of *either* of the two inputs. + + xorq h1, h1 // Previous high and low for m + xorq l1, l1 + xorq h2, h2 // Previous high and low for n + xorq l2, l2 + xorq c2, c2 // Mask flag: previous word of one was nonzero + // and in this case h1 and h2 are those words + + xorq i, i +bignum_coprime_toploop: + movq (m,i,8), b + movq (n,i,8), c + movq c2, c1 + andq h1, c1 + andq h2, c2 + movq b, a + orq c, a + negq a + cmovcq c1, l1 + cmovcq c2, l2 + cmovcq b, h1 + cmovcq c, h2 + sbbq c2, c2 + incq i + cmpq l, i + jc bignum_coprime_toploop + + movq h1, a + orq h2, a + bsrq a, c + xorq $63, c + shldq %cl, l1, h1 + shldq %cl, l2, h2 + +// m_lo = m[0], n_lo = n[0]; + + movq (m), %rax + movq %rax, m_lo + + movq (n), %rax + movq %rax, n_lo + +// Now the inner loop, with i as loop counter from CHUNKSIZE down. +// This records a matrix of updates to apply to the initial +// values of m and n with, at stage j: +// +// sgn * m' = (m_m * m - m_n * n) / 2^j +// -sgn * n' = (n_m * m - n_n * n) / 2^j +// +// where "sgn" is either +1 or -1, and we lose track of which except +// that both instance above are the same. This throwing away the sign +// costs nothing (since we have to correct in general anyway because +// of the proxied comparison) and makes things a bit simpler. But it +// is simply the parity of the number of times the first condition, +// used as the swapping criterion, fires in this loop. + + movl $1, m_mshort + movl $0, m_nshort + movl $0, n_mshort + movl $1, n_nshort + movl $CHUNKSIZE, ishort + +// Stash more variables over the inner loop to free up regs + + movq k, mat_mn + movq l, mat_nm + movq m, mat_mm + movq n, mat_nn + +// Conceptually in the inner loop we follow these steps: +// +// * If m_lo is odd and m_hi < n_hi, then swap the four pairs +// (m_hi,n_hi); (m_lo,n_lo); (m_m,n_m); (m_n,n_n) +// +// * Now, if m_lo is odd (old or new, doesn't matter as initial n_lo is odd) +// m_hi := m_hi - n_hi, m_lo := m_lo - n_lo +// m_m := m_m + n_m, m_n := m_n + n_n +// +// * Halve and double them +// m_hi := m_hi / 2, m_lo := m_lo / 2 +// n_m := n_m * 2, n_n := n_n * 2 +// +// The actual computation computes updates before actually swapping and +// then corrects as needed. 
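For reference, the three conceptual steps just listed translate directly into the following non-constant-time C sketch over the word-sized proxies and the 2x2 update matrix; the gcd_state struct and gcd_step are illustrative names, and the actual assembly below instead computes both outcomes and selects with conditional moves.

#include <stdint.h>

/* Word-level state: proxies for m and n plus the update-matrix entries. */
typedef struct {
  uint64_t m_hi, m_lo, n_hi, n_lo;
  uint64_t m_m, m_n, n_m, n_n;
} gcd_state;

/* One inner-loop step, exactly as described in the comment above. */
static void gcd_step(gcd_state *s) {
  if ((s->m_lo & 1) && s->m_hi < s->n_hi) {      /* swap the four pairs */
    uint64_t t;
    t = s->m_hi; s->m_hi = s->n_hi; s->n_hi = t;
    t = s->m_lo; s->m_lo = s->n_lo; s->n_lo = t;
    t = s->m_m;  s->m_m  = s->n_m;  s->n_m  = t;
    t = s->m_n;  s->m_n  = s->n_n;  s->n_n  = t;
  }
  if (s->m_lo & 1) {                             /* subtract / accumulate */
    s->m_hi -= s->n_hi;
    s->m_lo -= s->n_lo;
    s->m_m  += s->n_m;
    s->m_n  += s->n_n;
  }
  s->m_hi >>= 1;                                 /* halve the m side */
  s->m_lo >>= 1;
  s->n_m <<= 1;                                  /* double the n row */
  s->n_n <<= 1;
}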
+ +bignum_coprime_innerloop: + + xorl %eax, %eax + xorl %ebx, %ebx + xorq m, m + xorq n, n + btq $0, m_lo + + cmovcq n_hi, %rax + cmovcq n_lo, %rbx + cmovcq n_m, m + cmovcq n_n, n + + movq m_lo, l + subq %rbx, m_lo + subq l, %rbx + movq m_hi, k + subq %rax, k + cmovcq m_hi, n_hi + leaq -1(k), m_hi + cmovcq %rbx, m_lo + cmovcq l, n_lo + notq m_hi + cmovcq m_m, n_m + cmovcq m_n, n_n + cmovncq k, m_hi + + shrq $1, m_lo + addq m, m_m + addq n, m_n + shrq $1, m_hi + addq n_m, n_m + addq n_n, n_n + +// End of the inner for-loop + + decq i + jnz bignum_coprime_innerloop + +// Unstash the temporary variables + + movq mat_mn, k + movq mat_nm, l + movq mat_mm, m + movq mat_nn, n + +// Put the matrix entries in memory since we're out of registers +// We pull them out repeatedly in the next loop + + movq m_m, mat_mm + movq m_n, mat_mn + movq n_m, mat_nm + movq n_n, mat_nn + +// Now actually compute the updates to m and n corresponding to that matrix, +// and correct the signs if they have gone negative. First we compute the +// (k+1)-sized updates with the following invariant (here h1 and h2 are in +// fact carry bitmasks, either 0 or -1): +// +// h1::l1::m = m_m * m - m_n * n +// h2::l2::n = n_m * m - n_n * n + + xorq i, i + xorq h1, h1 + xorq l1, l1 + xorq h2, h2 + xorq l2, l2 +bignum_coprime_crossloop: + + movq (m,i,8), c + movq mat_mm, a + mulq c + addq a, l1 + adcq $0, d + movq d, c1 // Now c1::l1 is +ve part 1 + + movq mat_nm, a + mulq c + addq a, l2 + adcq $0, d + movq d, c2 // Now c2::l2 is +ve part 2 + + movq (n,i,8), c + movq mat_mn, a + mulq c + subq h1, d // Now d::a is -ve part 1 + + subq a, l1 + sbbq d, c1 + sbbq h1, h1 + movq l1, (m,i,8) + movq c1, l1 + + movq mat_nn, a + mulq c + subq h2, d // Now d::a is -ve part 2 + + subq a, l2 + sbbq d, c2 + sbbq h2, h2 + movq l2, (n,i,8) + movq c2, l2 + + incq i + cmpq l, i + jc bignum_coprime_crossloop + +// Now fix the signs of m and n if they have gone negative + + xorq i, i + movq h1, c1 // carry-in coded up as well + movq h2, c2 // carry-in coded up as well + xorq h1, l1 // for the bignum_coprime_end digit + xorq h2, l2 // for the bignum_coprime_end digit +bignum_coprime_optnegloop: + movq (m,i,8), a + xorq h1, a + negq c1 + adcq $0, a + sbbq c1, c1 + movq a, (m,i,8) + movq (n,i,8), a + xorq h2, a + negq c2 + adcq $0, a + sbbq c2, c2 + movq a, (n,i,8) + incq i + cmpq l, i + jc bignum_coprime_optnegloop + subq c1, l1 + subq c2, l2 + +// Now shift them right CHUNKSIZE bits + + movq l, i +bignum_coprime_shiftloop: + movq -8(m,i,8), a + movq a, h1 + shrdq $CHUNKSIZE, l1, a + movq a, -8(m,i,8) + movq h1, l1 + movq -8(n,i,8), a + movq a, h2 + shrdq $CHUNKSIZE, l2, a + movq a, -8(n,i,8) + movq h2, l2 + decq i + jnz bignum_coprime_shiftloop + +// End of main loop. We can stop if t' <= 0 since then m * n < 2^0, which +// since n is odd (in the main cases where we had one or other input odd) +// means that m = 0 and n is the final gcd. Moreover we do in fact need to +// maintain strictly t > 0 in the main loop, or the computation of the +// optimized digit bound l could collapse to 0. 
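The verdict assembled a few lines further down (compare the surviving n with 1, then AND with the saved parity word) can be summarized by this small illustrative C sketch; coprime_verdict is a hypothetical name, and the branch-free zero test plays the role of the neg/sbb/inc sequence in the assembly.

#include <stdint.h>

/* Coprime exactly when the k-digit survivor n equals 1 and at least one of
   the original inputs was odd (low bit of the saved OR of the bottom words). */
static uint64_t coprime_verdict(uint64_t k, const uint64_t *n, uint64_t evenor) {
  uint64_t a = n[0] ^ 1;                          /* zero iff bottom digit is 1 */
  for (uint64_t i = 1; i < k; i++) a |= n[i];
  uint64_t is_one = 1 ^ ((a | (0 - a)) >> 63);    /* 1 iff a == 0, i.e. n == 1 */
  return is_one & evenor;                         /* low bit of evenor if n == 1 */
}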
+ + subq $CHUNKSIZE, t + jnbe bignum_coprime_outerloop + +// Now compare n with 1 (OR of the XORs in a) + + movq (n), a + xorq $1, a + cmpq $1, k + jz bignum_coprime_finalcomb + movl $1, ishort +bignum_coprime_compareloop: + orq (n,i,8), a + incq i + cmpq k, i + jc bignum_coprime_compareloop + +// Now combine that with original "evenor" oddness flag +// The final condition is lsb(evenor) = 1 AND a = 0 + +bignum_coprime_finalcomb: + negq a + sbbq a, a + incq a + andq evenor, a + +// The end + +bignum_coprime_end: + addq $STACKVARSIZE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_copy.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_copy.S new file mode 100644 index 00000000000..50d7906e7f4 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_copy.S @@ -0,0 +1,83 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Copy bignum with zero-extension or truncation, z := x +// Input x[n]; output z[k] +// +// extern void bignum_copy +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_copy) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_copy) + .text + +#define k %rdi +#define z %rsi +#define n %rdx +#define x %rcx + +#define i %r8 +#define a %rax + + + +S2N_BN_SYMBOL(bignum_copy): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Replace RDX = n with RDX = min(k,n) so we are definitely safe copying those +// Initialize the element counter to 0 + + cmpq n, k + cmovcq k, n + xorq i, i + +// If min(k,n) = 0 jump to the padding stage + + testq n, n + jz bignum_copy_padding + +bignum_copy_copyloop: + movq (x,i,8), a + movq a, (z,i,8) + incq i + cmpq n, i + jc bignum_copy_copyloop + +bignum_copy_padding: + cmpq k, i + jnc bignum_copy_end + xorq a, a + +bignum_copy_padloop: + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_copy_padloop + +bignum_copy_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctd.S new file mode 100644 index 00000000000..954f386bb66 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctd.S @@ -0,0 +1,71 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero digits (64-bit words) +// Input x[k]; output function return +// +// extern uint64_t bignum_ctd (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is k +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ctd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ctd) + .text + +#define k %rdi +#define x %rsi +#define i %rdx +#define a %rax + + + +S2N_BN_SYMBOL(bignum_ctd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// If the bignum is zero-length, just return 0 + + xorq %rax, %rax + testq k, k + jz bignum_ctd_end + +// Record in i that the lowest nonzero word is i - 1, where i = k + 1 means +// that the bignum was entirely zero + + movq k, i + incq i +bignum_ctd_loop: + movq -8(x,k,8), a + testq a, a + cmovneq k, i + decq k + jnz bignum_ctd_loop + +// We now want to return i - 1 + + decq i + movq i, %rax +bignum_ctd_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctz.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctz.S new file mode 100644 index 00000000000..5dd61099564 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ctz.S @@ -0,0 +1,87 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero bits +// Input x[k]; output function return +// +// extern uint64_t bignum_ctz (uint64_t k, uint64_t *x); +// +// +// In the case of a zero bignum as input the result is 64 * k +// +// In principle this has a precondition k < 2^58, but obviously that +// is always true in practice because of address space limitations +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ctz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ctz) + .text + +#define k %rdi +#define x %rsi +#define i %rdx +#define w %rcx +#define a %rax + +#define wshort %ecx + + + +S2N_BN_SYMBOL(bignum_ctz): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// If the bignum is zero-length, just return 0 + + xorq %rax, %rax + testq k, k + jz bignum_ctz_end + +// Use w = a[i-1] to store nonzero words in a top-down sweep +// Set the initial default to be as if we had a 1 word directly above + + movq k, i + incq i + movl $1, wshort + +bignum_ctz_loop: + movq -8(x,k,8), a + testq a, a + cmovneq k, i + cmovneq a, w + decq k + jnz bignum_ctz_loop + +// Now w = a[i-1] is the lowest nonzero word, or in the zero case the +// default of the "extra" 1 = a[k]. We now want 64*(i-1) + ctz(w). 
+// Note that this code does not rely on the behavior of the BSF instruction +// for zero inputs, which is undefined according to the manual. + + decq i + shlq $6, i + bsfq w, %rax + addq i, %rax + +bignum_ctz_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_demont.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_demont.S new file mode 100644 index 00000000000..ee9ca9cfa5c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_demont.S @@ -0,0 +1,204 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from (almost-)Montgomery form z := (x / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_demont +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z := (x / 2^{64k}) mod m, hence mapping out of Montgomery domain. +// In other words, this is a k-fold Montgomery reduction with same-size input. +// This can handle almost-Montgomery inputs, i.e. any k-digit bignum. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define m %rcx + +// General temp, low part of product and mul input +#define a %rax +// General temp, high part of product (no longer x) +#define b %rdx +// Negated modular inverse +#define w %r8 +// Outer loop counter +#define i %r9 +// Inner loop counter +#define j %rbx +// Home for Montgomery multiplier +#define d %rbp +#define h %r10 +#define e %r11 +#define n %r12 + +// A temp reg in the initial word-level negmodinv, same as j + +#define t %rbx + +#define ashort %eax +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_demont): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers + + pushq %rbx + pushq %rbp + pushq %r12 + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_demont_end + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t + movq a, w + shlq $2, t + subq t, w + xorq $2, w + + movq w, t + imulq a, t + movl $2, ashort + addq t, a + addq $1, t + + imulq a, w + + imulq t, t + movl $1, ashort + addq t, a + imulq a, w + + imulq t, t + movl $1, ashort + addq t, a + imulq a, w + + imulq t, t + movl $1, ashort + addq t, a + imulq a, w + +// Initially just copy the input to the output. It would be a little more +// efficient but somewhat fiddlier to tweak the zeroth iteration below instead. 
+// After this we never use x again and can safely recycle RDX for muls + + xorq j, j +bignum_demont_iloop: + movq (x,j,8), a + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_demont_iloop + +// Outer loop, just doing a standard Montgomery reduction on z + + xorq i, i + +bignum_demont_outerloop: + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, n + decq n + jz bignum_demont_montend + +bignum_demont_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_demont_montloop + +bignum_demont_montend: + adcq $0, h + movq h, -8(z,j,8) + +// End of outer loop. + + incq i + cmpq k, i + jc bignum_demont_outerloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + xorq j, j + movq k, n +bignum_demont_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_demont_cmploop + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq e, e + xorq j, j +bignum_demont_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_demont_corrloop + +bignum_demont_end: + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digit.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digit.S new file mode 100644 index 00000000000..3e41e61b9c6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digit.S @@ -0,0 +1,72 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select digit x[n] +// Inputs x[k], n; output function return +// +// extern uint64_t bignum_digit (uint64_t k, uint64_t *x, uint64_t n); +// +// n'th digit of a k-digit (digit=64 bits) bignum, in constant-time style. +// Indexing starts at 0, which is the least significant digit (little-endian). +// Returns zero if n >= k, i.e. we read a digit off the end of the bignum. 
+// +// Standard x86-64 ABI: RDI = k, RSI = x, RDX = n, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, R8 = n, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_digit) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_digit) + .text + +#define k %rdi +#define x %rsi +#define n %rdx + +#define d %rax +#define i %rcx +#define a %r8 + +S2N_BN_SYMBOL(bignum_digit): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Set the default digit to 0, and for length zero finish immediately + + xorq d, d + testq k, k + jz bignum_digit_end + +// Main loop: run over all the digits and take note of the n'th one + + xorq i, i +bignum_digit_loop: + movq (x,i,8), a + cmpq n, i + cmovzq a, d + incq i + cmpq k, i + jc bignum_digit_loop + +// Return + +bignum_digit_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digitsize.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digitsize.S new file mode 100644 index 00000000000..a1902b6b4f5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_digitsize.S @@ -0,0 +1,70 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return size of bignum in digits (64-bit word) +// Input x[k]; output function return +// +// extern uint64_t bignum_digitsize (uint64_t k, uint64_t *x); +// +// In the case of a zero bignum as input the result is 0 +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_digitsize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_digitsize) + .text + +#define k %rdi +#define x %rsi +#define i %rax +#define a %rcx +#define j %rdx + + + +S2N_BN_SYMBOL(bignum_digitsize): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize the index i and also prepare default return value of 0 (i = %rax) + + xorq i, i + +// If the bignum is zero-length, just return 0 + + testq k, k + jz bignum_digitsize_end + +// Run over the words j = 0..i-1, and set i := j + 1 when hitting nonzero a[j] + + xorq j, j +bignum_digitsize_loop: + movq (x,j,8), a + incq j + testq a, a + cmovnzq j, i + cmpq k, j + jnz bignum_digitsize_loop + +bignum_digitsize_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_divmod10.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_divmod10.S new file mode 100644 index 00000000000..14bbc9e37dd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_divmod10.S @@ -0,0 +1,98 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Divide bignum by 10: z' := z div 10, returning remainder z mod 10 +// Inputs z[k]; outputs function return (remainder) and z[k] +// +// extern uint64_t bignum_divmod10 (uint64_t k, uint64_t *z); +// +// Standard x86-64 ABI: RDI = k, RSI = z, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_divmod10) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_divmod10) + .text + +#define k %rdi +#define z %rsi + +#define d %rcx + +#define l %rdx +#define r %rax + +#define q %r8 +#define h %r8 + +#define s %r9 +#define w %r10 + +#define rshort %eax +#define wshort %r10d + +S2N_BN_SYMBOL(bignum_divmod10): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize remainder to 0 and if k = 0 return + + xorl rshort, rshort + testq k, k + jz bignum_divmod10_end + +// Straightforward top-down loop doing 10 * q + r' := 2^64 * r + d + + movq $0x3333333333333334, s + movl $0x3333333, wshort + +bignum_divmod10_divloop: + movq -8(z,k,8), d + +// First re-split and shift so 2^28 * h + l = (2^64 * r + d) / 2 +// Then (2^64 * r + d) / 10 = [(2^28 - 1) / 5] * h + (h + l) / 5 + + movq d, l + shlq $35, l + shldq $35, d, r + shrq $36, l + movq r, h + + addq l, r + mulq s + imulq w, h + addq l, q + movq q, -8(z,k,8) + +// Generate the new remainder r = d - 10 * q +// Since r <= 9 we only need the low part computation ignoring carries + + leaq (q,q,4), q + negq q + leaq (d,q,2), r + + decq k + jnz bignum_divmod10_divloop + +// Return %rax = r as the final remainder + +bignum_divmod10_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_emontredc.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_emontredc.S new file mode 100644 index 00000000000..e39905a1fa1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_emontredc.S @@ -0,0 +1,155 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Extended Montgomery reduce, returning results in input-output buffer +// Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] +// +// extern uint64_t bignum_emontredc +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t w); +// +// Assumes that z initially holds a 2k-digit bignum z_0, m is a k-digit odd +// bignum and m * w == -1 (mod 2^64). This function also uses z for the output +// as well as returning a carry c of 0 or 1. This encodes two numbers: in the +// lower half of the z buffer we have q = z[0..k-1], while the upper half +// together with the carry gives r = 2^{64k}*c + z[k..2k-1]. These values +// satisfy z_0 + q * m = 2^{64k} * r, i.e. r gives a raw (unreduced) Montgomery +// reduction while q gives the multiplier that was used. Another way of +// thinking of it is that if z' is the output z with the lower half replaced +// with zeros, then z_0 + q * m = 2^{128k} * c + z'. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = w, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = w, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_emontredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_emontredc) + .text + +// Argument m comes in in %rdx but we copy it to %r8 + +#define k %rdi +#define z %rsi +#define m %r8 +#define w %rcx + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Home for i'th digit or Montgomery multiplier +#define d %rbx + +// Outer loop counter +#define i %r9 +// Inner loop counter +#define j %r10 + +#define h %r11 +#define e %r12 +#define t %r13 +#define c %r14 + +#define cshort %r14d +#define jshort %r10d + + +S2N_BN_SYMBOL(bignum_emontredc): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + +// Initialize top carry to zero immediately to catch the k = 0 case + + xorq c, c + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_emontredc_end + +// Move m into its permanent home since we need RDX for muls + + movq %rdx, m + +// Launch into the outer loop + + xorq i, i +bignum_emontredc_outerloop: + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + movq d, (z) + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, t + decq t + jz bignum_emontredc_montend + +bignum_emontredc_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, (z,j,8) + movq %rdx, h + incq j + decq t + jnz bignum_emontredc_montloop + +bignum_emontredc_montend: + adcq c, h + movl $0, cshort + adcq $0, c + movq (z,k,8), a + addq h, a + movq a, (z,k,8) + adcq $0, c + +// End of outer loop. + + addq $8, z // For simple indexing, z pointer moves + incq i + cmpq k, i + jc bignum_emontredc_outerloop + +bignum_emontredc_end: + +// Put the top carry in the expected place, restore registers and return + + movq c, %rax + popq %r14 + popq %r13 + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_eq.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_eq.S new file mode 100644 index 00000000000..5e6f4f82e28 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_eq.S @@ -0,0 +1,97 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignums for equality, x = y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_eq +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_eq) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_eq) + .text + +#define m %rdi +#define x %rsi +#define n %rdx +#define y %rcx +#define c %rax +// We can re-use n for this, not needed when d appears +#define d %rdx + +S2N_BN_SYMBOL(bignum_eq): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Initialize the accumulated OR of differences to zero + + xorq c, c + +// If m >= n jump into the m > n loop at the final equality test +// This will drop through for m = n + + cmpq n, m + jnc bignum_eq_mtest + +// Toploop for the case n > m + +bignum_eq_nloop: + decq n + orq (y,n,8), c + cmpq n, m + jnz bignum_eq_nloop + jmp bignum_eq_mmain + +// Toploop for the case m > n (or n = m which enters at "mtest") + +bignum_eq_mloop: + decq m + orq (x,m,8), c + cmpq n, m +bignum_eq_mtest: + jnz bignum_eq_mloop + +// Combined main loop for the min(m,n) lower words + +bignum_eq_mmain: + testq m, m + jz bignum_eq_end + +bignum_eq_loop: + movq -8(x,m,8), d + xorq -8(y,m,8), d + orq d, c + decq m + jnz bignum_eq_loop + +// Set a standard C condition based on whether c is nonzero + +bignum_eq_end: + negq c + sbbq c, c + incq c +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_even.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_even.S new file mode 100644 index 00000000000..2f66295cc81 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_even.S @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for even-ness +// Input x[k]; output function return +// +// extern uint64_t bignum_even (uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_even) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_even) + .text + +S2N_BN_SYMBOL(bignum_even): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set default return value of 1 and finish if k = 0 (trivially even) + + movl $1, %eax + testq %rdi, %rdi + jz bignum_even_end + +// Otherwise XOR that initial 1 with the lowest bit of the input + + xorq (%rsi), %rax + andq $1, %rax + +bignum_even_end: + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ge.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ge.S new file mode 100644 index 00000000000..ccc237e565b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_ge.S @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x >= y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_ge +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_ge) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_ge) + .text + +#define m %rdi +#define x %rsi +#define n %rdx +#define y %rcx +#define i %r8 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_ge): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// Speculatively form m := m - n and do case split + + subq n, m + jc bignum_ge_ylonger + +// The case where x is longer or of the same size (m >= n) + + incq m + testq n, n + jz bignum_ge_xtest +bignum_ge_xmainloop: + movq (x,i,8), a + sbbq (y,i,8), a + incq i + decq n + jnz bignum_ge_xmainloop + jmp bignum_ge_xtest +bignum_ge_xtoploop: + movq (x,i,8), a + sbbq $0, a + incq i +bignum_ge_xtest: + decq m + jnz bignum_ge_xtoploop + sbbq a, a + incq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where y is longer (n > m) + +bignum_ge_ylonger: + addq n, m + subq m, n + testq m, m + jz bignum_ge_ytoploop +bignum_ge_ymainloop: + movq (x,i,8), a + sbbq (y,i,8), a + incq i + decq m + jnz bignum_ge_ymainloop +bignum_ge_ytoploop: + movl $0, ashort + sbbq (y,i,8), a + incq i + decq n + jnz bignum_ge_ytoploop + + sbbq a, a + incq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits 
+#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_gt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_gt.S new file mode 100644 index 00000000000..e2673ad390e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_gt.S @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x > y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_gt +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_gt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_gt) + .text + +#define m %rdi +#define x %rsi +#define n %rdx +#define y %rcx +#define i %r8 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_gt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// Speculatively form n := n - m and do case split + + subq m, n + jc bignum_gt_ylonger + +// The case where y is longer or of the same size (n >= m) + + incq n + testq m, m + jz bignum_gt_xtest +bignum_gt_xmainloop: + movq (y,i,8), a + sbbq (x,i,8), a + incq i + decq m + jnz bignum_gt_xmainloop + jmp bignum_gt_xtest +bignum_gt_xtoploop: + movq (y,i,8), a + sbbq $0, a + incq i +bignum_gt_xtest: + decq n + jnz bignum_gt_xtoploop + sbbq a, a + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where x is longer (m > n) + +bignum_gt_ylonger: + addq m, n + subq n, m + testq n, n + jz bignum_gt_ytoploop +bignum_gt_ymainloop: + movq (y,i,8), a + sbbq (x,i,8), a + incq i + decq n + jnz bignum_gt_ymainloop +bignum_gt_ytoploop: + movl $0, ashort + sbbq (x,i,8), a + incq i + decq m + jnz bignum_gt_ytoploop + + sbbq a, a + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_iszero.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_iszero.S new file mode 100644 index 00000000000..f33b8fc714e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_iszero.S @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for zero-ness, x = 0 +// Input x[k]; output function return +// +// extern uint64_t bignum_iszero (uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_iszero) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_iszero) + .text + +#define a %rax +#define k %rdi +#define x %rsi + +S2N_BN_SYMBOL(bignum_iszero): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + xorq a, a + testq k, k + jz bignum_iszero_end + +bignum_iszero_loop: + orq -8(x,k,8), a + decq k + jnz bignum_iszero_loop + +// Set a standard C condition based on whether a is nonzero + + negq a + sbbq a, a + +bignum_iszero_end: + incq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_le.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_le.S new file mode 100644 index 00000000000..114755d29d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_le.S @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x <= y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_le +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_le) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_le) + .text + +#define m %rdi +#define x %rsi +#define n %rdx +#define y %rcx +#define i %r8 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_le): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// Speculatively form n := n - m and do case split + + subq m, n + jc bignum_le_ylonger + +// The case where y is longer or of the same size (n >= m) + + incq n + testq m, m + jz bignum_le_xtest +bignum_le_xmainloop: + movq (y,i,8), a + sbbq (x,i,8), a + incq i + decq m + jnz bignum_le_xmainloop + jmp bignum_le_xtest +bignum_le_xtoploop: + movq (y,i,8), a + sbbq $0, a + incq i +bignum_le_xtest: + decq n + jnz bignum_le_xtoploop + sbbq a, a + incq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where x is longer (m > n) + +bignum_le_ylonger: + addq m, n + subq n, m + testq n, n + jz bignum_le_ytoploop +bignum_le_ymainloop: + movq (y,i,8), a + sbbq (x,i,8), a + incq i + decq n + jnz bignum_le_ymainloop +bignum_le_ytoploop: + movl $0, ashort + sbbq (x,i,8), a + incq i + decq m + jnz bignum_le_ytoploop + + sbbq a, a + incq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && 
defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_lt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_lt.S new file mode 100644 index 00000000000..95a1cc3c979 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_lt.S @@ -0,0 +1,111 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compare bignums, x < y +// Inputs x[m], y[n]; output function return +// +// extern uint64_t bignum_lt +// (uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Standard x86-64 ABI: RDI = m, RSI = x, RDX = n, RCX = y, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = x, R8 = n, R9 = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_lt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_lt) + .text + +#define m %rdi +#define x %rsi +#define n %rdx +#define y %rcx +#define i %r8 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_lt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// Speculatively form m := m - n and do case split + + subq n, m + jc bignum_lt_ylonger + +// The case where x is longer or of the same size (m >= n) + + incq m + testq n, n + jz bignum_lt_xtest +bignum_lt_xmainloop: + movq (x,i,8), a + sbbq (y,i,8), a + incq i + decq n + jnz bignum_lt_xmainloop + jmp bignum_lt_xtest +bignum_lt_xtoploop: + movq (x,i,8), a + sbbq $0, a + incq i +bignum_lt_xtest: + decq m + jnz bignum_lt_xtoploop + sbbq a, a + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where y is longer (n > m) + +bignum_lt_ylonger: + addq n, m + subq m, n + testq m, m + jz bignum_lt_ytoploop +bignum_lt_ymainloop: + movq (x,i,8), a + sbbq (y,i,8), a + incq i + decq m + jnz bignum_lt_ymainloop +bignum_lt_ytoploop: + movl $0, ashort + sbbq (y,i,8), a + incq i + decq n + jnz bignum_lt_ytoploop + + sbbq a, a + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_madd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_madd.S new file mode 100644 index 00000000000..4f5f876424a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_madd.S @@ -0,0 +1,162 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply-add, z := z + x * y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_madd +// (uint64_t k, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the "z := x * y + z" operation, while also returning a "next" or +// "carry" word. In the case where m + n <= p (i.e. the pure product would +// fit in the destination) this is the remainder for the exact result. 
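+//
+// Illustrative (unverified) C-level sketch of the column-wise accumulation
+// used below, for orientation only; "ref_madd" and its 128-bit temporaries
+// are placeholders, not part of this file:
+//
+//   uint64_t ref_madd(uint64_t k, uint64_t *z, uint64_t m, const uint64_t *x,
+//                     uint64_t n, const uint64_t *y) {
+//     uint64_t l = 0, h = 0;                       // low two words of accumulator
+//     for (uint64_t i = 0; i < k; i++) {
+//       uint64_t c = 0;                            // third accumulator word
+//       uint64_t jlo = (i + 1 > n) ? i + 1 - n : 0;
+//       uint64_t jhi = (i + 1 < m) ? i + 1 : m;
+//       for (uint64_t j = jlo; j < jhi; j++) {     // all products landing in column i
+//         unsigned __int128 t = (unsigned __int128)x[j] * y[i - j];
+//         unsigned __int128 s = (unsigned __int128)l + (uint64_t)t;
+//         l = (uint64_t)s;
+//         s = (unsigned __int128)h + (uint64_t)(t >> 64) + (uint64_t)(s >> 64);
+//         h = (uint64_t)s;
+//         c += (uint64_t)(s >> 64);
+//       }
+//       unsigned __int128 s = (unsigned __int128)z[i] + l;  // add existing digit
+//       z[i] = (uint64_t)s;
+//       s = (unsigned __int128)h + (uint64_t)(s >> 64);
+//       l = (uint64_t)s;                           // shift accumulator down a word
+//       h = c + (uint64_t)(s >> 64);
+//     }
+//     return l;                                    // the "next"/carry word
+//   }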
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_madd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_madd) + .text + +// These are actually right + +#define p %rdi +#define z %rsi +#define n %r8 + +// These are not + +#define c %r15 +#define h %r14 +#define l %r13 +#define x %r12 +#define y %r11 +#define i %rbx +#define k %r10 +#define m %rbp + +// These are always local scratch since multiplier result is in these + +#define a %rax +#define d %rdx + + + +S2N_BN_SYMBOL(bignum_madd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// We use too many registers, and also we need %rax:%rdx for multiplications + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, m + +// If the result size is zero, just return %rax = 0 +// We could also do this if either input is size 0. + + xorq %rax, %rax + testq p, p + jz bignum_madd_end + +// Set initial 2-part sum to zero (we zero c inside the body) + + xorq h, h + xorq l, l + +// Otherwise do outer loop k = 0 ... k = p - 1 + + xorq k, k + +bignum_madd_outerloop: + +// Zero our carry term first; we eventually want it and a zero is useful now +// Set a = max 0 (k + 1 - n), i = min (k + 1) m +// This defines the range a <= j < i for the inner summation +// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow +// And since we want to increment it anyway, we might as well do it now + + xorq c, c // c = 0 + incq k // k = k + 1 + + movq k, a // a = k + 1 + subq n, a // a = k + 1 - n + cmovcq c, a // a = max 0 (k + 1 - n) + + movq m, i // i = m + cmpq m, k // CF <=> k + 1 < m + cmovcq k, i // i = min (k + 1) m + +// Turn i into a loop count, and skip things if it's <= 0 +// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a] +// and then launch into the main inner loop, postdecrementing i + + movq k, d + subq i, d + subq a, i + jbe bignum_madd_innerend + leaq (%rcx,a,8), x + leaq -8(%r9,d,8), y + +bignum_madd_innerloop: + movq (y,i,8), %rax + mulq (x) + addq $8, x + addq %rax, l + adcq %rdx, h + adcq $0, c + decq i + jnz bignum_madd_innerloop + +bignum_madd_innerend: + + addq l, (z) + adcq $0, h + adcq $0, c + movq h, l + movq c, h + addq $8, z + + cmpq p, k + jc bignum_madd_outerloop + +// Move the carry term into the return value + + movq l, %rax + +bignum_madd_end: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modadd.S new file mode 100644 index 00000000000..351ed07515e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modadd.S @@ -0,0 +1,99 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo m, z := (x + y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_modadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modadd) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define y %rcx +#define m %r8 +#define i %r9 +#define j %r10 +#define a %rax +#define c %r11 + +S2N_BN_SYMBOL(bignum_modadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// If k = 0 do nothing + + testq k, k + jz bignum_modadd_end + +// First just add (c::z) := x + y + + xorq c, c + movq k, j + xorq i, i +bignum_modadd_addloop: + movq (x,i,8), a + adcq (y,i,8), a + movq a, (z,i,8) + incq i + decq j + jnz bignum_modadd_addloop + adcq $0, c + +// Now do a comparison subtraction (c::z) - m, recording mask for (c::z) >= m + + movq k, j + xorq i, i +bignum_modadd_cmploop: + movq (z,i,8), a + sbbq (m,i,8), a + incq i + decq j + jnz bignum_modadd_cmploop + sbbq $0, c + notq c + +// Now do a masked subtraction z := z - [c] * m + + xorq i, i +bignum_modadd_subloop: + movq (m,i,8), a + andq c, a + negq j + sbbq a, (z,i,8) + sbbq j, j + incq i + cmpq k, i + jc bignum_modadd_subloop + +bignum_modadd_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_moddouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_moddouble.S new file mode 100644 index 00000000000..e684d51b2ab --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_moddouble.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo m, z := (2 * x) mod m, assuming x reduced +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_moddouble +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_moddouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_moddouble) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define m %rcx +#define i %r8 +#define a %r9 +#define c %rax +#define b %r10 + +S2N_BN_SYMBOL(bignum_moddouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// If k = 0 do nothing + + testq k, k + jz bignum_moddouble_end + +// Do (_::z) = 2 * x - m and generate a mask in c for 2 * x < m + + xorq c, c + xorq i, i + xorq b, b + +bignum_moddouble_dubloop: + movq (x,i,8), a + shrdq $63, a, c + negq b + sbbq (m,i,8), c + sbbq b, b + movq c, (z,i,8) + movq a, c + incq i + cmpq k, i + jc bignum_moddouble_dubloop + shrq $63, c + + addq b, c + +// Now do a corrective masked addition z := z + [c] * m + + xorq i, i + xorq b, b +bignum_moddouble_corrloop: + movq (m,i,8), a + andq c, a + negq b + adcq (z,i,8), a + sbbq b, b + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_moddouble_corrloop + +bignum_moddouble_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modexp.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modexp.S new file mode 100644 index 00000000000..d5dc9b70a3c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modexp.S @@ -0,0 +1,671 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular exponentiation for arbitrary odd modulus +// Inputs a[k], p[k], m[k]; output z[k], temporary buffer t[>=3*k] +// +// extern void bignum_modexp +// (uint64_t k,uint64_t *z, uint64_t *a,uint64_t *p,uint64_t *m,uint64_t *t); +// +// Does z := (a^p) mod m where all numbers are k-digit and m is odd +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = a, RCX = p, R8 = m, R9 = t +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = a, R9 = p, [RSP+40] = m, [RSP+48] = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modexp) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modexp) + .text + +// Local variables, all kept on the stack + +#define k (%rsp) +#define res 8(%rsp) +#define a 16(%rsp) +#define p 24(%rsp) +#define m 32(%rsp) +#define x 40(%rsp) +#define i 48(%rsp) +#define y 56(%rsp) +#define z 64(%rsp) + +#define VARSIZE 72 + +S2N_BN_SYMBOL(bignum_modexp): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. 
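+//
+// Illustrative (unverified) sketch of the control flow implemented below, in
+// rough C; the helper names shadow the local subroutines further down and the
+// argument lists mirror the registers set up before each call:
+//
+//   amontifier(k, z, m, y);                  // z = 2^{128k} mod m
+//   amontmul(k, x, z, a, m);                 // x = a * 2^{64k} mod m (Montgomery form)
+//   demont(k, z, z, m);                      // z = 2^{64k} mod m, i.e. "1" in Montgomery form
+//   for (uint64_t i = 64 * k; i-- > 0; ) {   // left-to-right square-and-multiply
+//     amontmul(k, y, z, z, m);               // y = z^2
+//     amontmul(k, z, x, y, m);               // z = y * x
+//     uint64_t bit = (p[i >> 6] >> (i & 63)) & 1;
+//     mux(bit, k, z, z, y);                  // keep the multiply only if bit is set
+//   }
+//   demont(k, z, z, m);                      // strip the Montgomery factor
+//   mux(0, k, res, z, z);                    // copy z into the output buffer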
+ +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 + callq bignum_modexp_standard + popq %rsi + popq %rdi + ret + +bignum_modexp_standard: +#endif + +// Real start of the standard ABI code. +// Bump down the stack to make room for local variables + + subq $VARSIZE, %rsp + +// If size is zero (which falsifies the oddness condition) do nothing + + testq %rdi, %rdi + jz bignum_modexp_end + +// Set up local variables based on input parameters + + movq %rdi, k + movq %rsi, res + movq %rdx, a + movq %rcx, p + movq %r8, m + movq %r9, x + leaq (%r9,%rdi,8), %rax + movq %rax, y + leaq (%rax,%rdi,8), %rax + movq %rax, z + +// Let x == 2^64k * a (mod m) and initialize z == 2^64k * 1 (mod m) + + movq k, %rdi + movq z, %rsi + movq m, %rdx + movq y, %rcx + callq bignum_modexp_local_amontifier + + movq k, %rdi + movq x, %rsi + movq z, %rdx + movq a, %rcx + movq m, %r8 + callq bignum_modexp_local_amontmul + + movq k, %rdi + movq z, %rsi + movq z, %rdx + movq m, %rcx + callq bignum_modexp_local_demont + +// Main loop with z == 2^64k * a^(p >> 2^i) (mod m) + + movq k, %rax + shlq $6, %rax + movq %rax, i + +bignum_modexp_loop: + subq $1, %rax + movq %rax, i + + movq k, %rdi + movq y, %rsi + movq z, %rdx + movq z, %rcx + movq m, %r8 + callq bignum_modexp_local_amontmul + + movq k, %rdi + movq z, %rsi + movq x, %rdx + movq y, %rcx + movq m, %r8 + callq bignum_modexp_local_amontmul + + movq i, %rdx + movq %rdx, %rcx + shrq $6, %rdx + movq p, %rsi + movq (%rsi,%rdx,8), %rdi + shrq %cl, %rdi + andq $1, %rdi + + movq k, %rsi + movq z, %rdx + movq z, %rcx + movq y, %r8 + callq bignum_modexp_local_mux + + movq i, %rax + testq %rax, %rax + jnz bignum_modexp_loop + +// Convert back from Montgomery representation and copy the result +// (via a degenerate case of multiplexing) into the output buffer + + movq k, %rdi + movq z, %rsi + movq z, %rdx + movq m, %rcx + callq bignum_modexp_local_demont + + xorl %edi, %edi + movq k, %rsi + movq res, %rdx + movq z, %rcx + movq z, %r8 + callq bignum_modexp_local_mux + +// Restore the stack pointer and return + +bignum_modexp_end: + addq $VARSIZE, %rsp + ret + +// Local copy of bignum_amontifier + +bignum_modexp_local_amontifier: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + movq %rdx, %r12 + movq %rcx, %r13 + testq %rdi, %rdi + je bignum_modexp_amontifier_end + xorq %rbx, %rbx +bignum_modexp_copyinloop: + movq (%r12,%rbx,8), %rcx + movq %rcx, (%r13,%rbx,8) + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_copyinloop + movq %rdi, %rbx + decq %rbx + je bignum_modexp_normalized +bignum_modexp_normloop: + xorq %rbp, %rbp + movq %rdi, %r11 + negq %rcx + movl $0x0, %eax +bignum_modexp_shufloop: + movq %rax, %rcx + movq (%r13,%rbp,8), %rax + cmovbq %rax, %rcx + movq %rcx, (%r13,%rbp,8) + incq %rbp + decq %r11 + jne bignum_modexp_shufloop + decq %rbx + jne bignum_modexp_normloop +bignum_modexp_normalized: + bsrq %rcx, %rcx + xorq $0x3f, %rcx + xorq %r9, %r9 + xorq %rbx, %rbx +bignum_modexp_bitloop: + movq (%r13,%rbx,8), %rax + movq %rax, %rbp + shldq %cl, %r9, %rax + movq %rax, (%r13,%rbx,8) + movq %rbp, %r9 + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_bitloop + movq -0x8(%r13,%rdi,8), %r11 + movl $0x1, %r8d + movq %r11, %r9 + negq %r9 + movl $0x3e, %ebx +bignum_modexp_estloop: + addq %r8, %r8 + movq %r11, %rax + subq %r9, %rax + cmpq %rax, %r9 + sbbq %rax, %rax + notq %rax + subq %rax, %r8 + addq %r9, %r9 + andq %r11, %rax + subq %rax, %r9 + decq %rbx + jne 
bignum_modexp_estloop + incq %r9 + cmpq %r9, %r11 + adcq $0x0, %r8 + xorq %rcx, %rcx + xorq %rbx, %rbx +bignum_modexp_mulloop: + movq (%r13,%rbx,8), %rax + mulq %r8 + addq %rcx, %rax + adcq $0x0, %rdx + movq %rax, (%rsi,%rbx,8) + movq %rdx, %rcx + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_mulloop + movabs $0x4000000000000000, %rax + subq %rax, %rcx + sbbq %r8, %r8 + notq %r8 + xorq %rcx, %rcx + xorq %rbx, %rbx +bignum_modexp_remloop: + movq (%r13,%rbx,8), %rax + andq %r8, %rax + negq %rcx + sbbq (%rsi,%rbx,8), %rax + sbbq %rcx, %rcx + movq %rax, (%rsi,%rbx,8) + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_remloop + xorq %rcx, %rcx + xorq %rbp, %rbp + xorq %r9, %r9 +bignum_modexp_dubloop1: + movq (%rsi,%rbp,8), %rax + shrdq $0x3f, %rax, %rcx + negq %r9 + sbbq (%r13,%rbp,8), %rcx + sbbq %r9, %r9 + movq %rcx, (%rsi,%rbp,8) + movq %rax, %rcx + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_dubloop1 + shrq $0x3f, %rcx + addq %r9, %rcx + xorq %rbp, %rbp + xorq %r9, %r9 +bignum_modexp_corrloop1: + movq (%r13,%rbp,8), %rax + andq %rcx, %rax + negq %r9 + adcq (%rsi,%rbp,8), %rax + sbbq %r9, %r9 + movq %rax, (%rsi,%rbp,8) + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_corrloop1 + xorq %rcx, %rcx + xorq %rbp, %rbp + xorq %r9, %r9 +bignum_modexp_dubloop2: + movq (%rsi,%rbp,8), %rax + shrdq $0x3f, %rax, %rcx + negq %r9 + sbbq (%r13,%rbp,8), %rcx + sbbq %r9, %r9 + movq %rcx, (%rsi,%rbp,8) + movq %rax, %rcx + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_dubloop2 + shrq $0x3f, %rcx + addq %r9, %rcx + xorq %rbp, %rbp + xorq %r9, %r9 +bignum_modexp_corrloop2: + movq (%r13,%rbp,8), %rax + andq %rcx, %rax + negq %r9 + adcq (%rsi,%rbp,8), %rax + sbbq %r9, %r9 + movq %rax, (%rsi,%rbp,8) + movq %rax, (%r13,%rbp,8) + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_corrloop2 + xorq %r11, %r11 + movq %rdi, %rbx +bignum_modexp_modloop: + xorq %r9, %r9 + movq %rdi, %r8 + xorq %rbp, %rbp + xorq %rcx, %rcx +bignum_modexp_cmaloop: + adcq %r9, %rcx + sbbq %r10, %r10 + movq (%rsi,%rbp,8), %rax + mulq %r11 + subq %r10, %rdx + addq %rcx, %rax + movq (%r13,%rbp,8), %r9 + movq %rax, (%r13,%rbp,8) + movq %rdx, %rcx + incq %rbp + decq %r8 + jne bignum_modexp_cmaloop + adcq %rcx, %r9 + movq %r9, %r11 + sbbq %r10, %r10 + xorq %rbp, %rbp + xorq %rcx, %rcx +bignum_modexp_oaloop: + movq (%r13,%rbp,8), %rax + movq (%rsi,%rbp,8), %r9 + andq %r10, %r9 + negq %rcx + adcq %r9, %rax + sbbq %rcx, %rcx + movq %rax, (%r13,%rbp,8) + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_oaloop + subq %rcx, %r11 + decq %rbx + jne bignum_modexp_modloop + movq (%r12), %rax + movq %rax, %rcx + movq %rax, %r9 + shlq $0x2, %rcx + subq %rcx, %r9 + xorq $0x2, %r9 + movq %r9, %rcx + imulq %rax, %rcx + movl $0x2, %eax + addq %rcx, %rax + addq $0x1, %rcx + imulq %rax, %r9 + imulq %rcx, %rcx + movl $0x1, %eax + addq %rcx, %rax + imulq %rax, %r9 + imulq %rcx, %rcx + movl $0x1, %eax + addq %rcx, %rax + imulq %rax, %r9 + imulq %rcx, %rcx + movl $0x1, %eax + addq %rcx, %rax + imulq %rax, %r9 + movq (%r13), %rcx + imulq %rcx, %r9 + movq (%r12), %rax + mulq %r9 + addq %rcx, %rax + movq %rdx, %rcx + movl $0x1, %ebp + movq %rdi, %r8 + decq %r8 + je bignum_modexp_montifend +bignum_modexp_montifloop: + adcq (%r13,%rbp,8), %rcx + sbbq %r10, %r10 + movq (%r12,%rbp,8), %rax + mulq %r9 + subq %r10, %rdx + addq %rcx, %rax + movq %rax, -0x8(%r13,%rbp,8) + movq %rdx, %rcx + incq %rbp + decq %r8 + jne bignum_modexp_montifloop +bignum_modexp_montifend: + adcq %rcx, %r11 + sbbq %r10, %r10 + movq %r11, -0x8(%r13,%rdi,8) + xorq %rbp, %rbp + xorq %rcx, %rcx 
+bignum_modexp_osloop: + movq (%r13,%rbp,8), %rax + movq (%r12,%rbp,8), %r9 + andq %r10, %r9 + negq %rcx + sbbq %r9, %rax + sbbq %rcx, %rcx + movq %rax, (%rsi,%rbp,8) + incq %rbp + cmpq %rdi, %rbp + jb bignum_modexp_osloop +bignum_modexp_amontifier_end: + popq %r13 + popq %r12 + popq %rbx + popq %rbp + ret + +// Local copy of bignum_amontmul + +bignum_modexp_local_amontmul: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x8, %rsp + testq %rdi, %rdi + je bignum_modexp_amont_end + movq %rdx, %r9 + movq (%r8), %rax + movq %rax, %rdx + movq %rax, %rbx + shlq $0x2, %rdx + subq %rdx, %rbx + xorq $0x2, %rbx + movq %rbx, %rdx + imulq %rax, %rdx + movl $0x2, %eax + addq %rdx, %rax + addq $0x1, %rdx + imulq %rax, %rbx + imulq %rdx, %rdx + movl $0x1, %eax + addq %rdx, %rax + imulq %rax, %rbx + imulq %rdx, %rdx + movl $0x1, %eax + addq %rdx, %rax + imulq %rax, %rbx + imulq %rdx, %rdx + movl $0x1, %eax + addq %rdx, %rax + imulq %rax, %rbx + movq %rbx, (%rsp) + xorq %r13, %r13 + xorq %rbx, %rbx +bignum_modexp_zoop: + movq %r13, (%rsi,%rbx,8) + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_zoop + xorq %r14, %r14 +bignum_modexp_outeramontloop: + movq (%r9,%r13,8), %rbp + xorq %rbx, %rbx + xorq %r10, %r10 + xorq %r15, %r15 + movq %rdi, %r12 +bignum_modexp_maddloop: + adcq (%rsi,%rbx,8), %r10 + sbbq %r11, %r11 + movq (%rcx,%rbx,8), %rax + mulq %rbp + subq %r11, %rdx + addq %r10, %rax + movq %rax, (%rsi,%rbx,8) + movq %rdx, %r10 + incq %rbx + decq %r12 + jne bignum_modexp_maddloop + adcq %r10, %r14 + adcq %r15, %r15 + movq (%rsi), %r11 + movq (%rsp), %rbp + imulq %r11, %rbp + movq (%r8), %rax + mulq %rbp + addq %r11, %rax + movq %rdx, %r10 + movl $0x1, %ebx + movq %rdi, %r12 + decq %r12 + je bignum_modexp_montend +bignum_modexp_montloop: + adcq (%rsi,%rbx,8), %r10 + sbbq %r11, %r11 + movq (%r8,%rbx,8), %rax + mulq %rbp + subq %r11, %rdx + addq %r10, %rax + movq %rax, -0x8(%rsi,%rbx,8) + movq %rdx, %r10 + incq %rbx + decq %r12 + jne bignum_modexp_montloop +bignum_modexp_montend: + adcq %r14, %r10 + adcq $0x0, %r15 + movq %r15, %r14 + movq %r10, -0x8(%rsi,%rbx,8) + incq %r13 + cmpq %rdi, %r13 + jb bignum_modexp_outeramontloop + xorq %rbp, %rbp + subq %r14, %rbp + xorq %r11, %r11 + xorq %rbx, %rbx +bignum_modexp_acorrloop: + movq (%r8,%rbx,8), %rax + andq %rbp, %rax + negq %r11 + sbbq %rax, (%rsi,%rbx,8) + sbbq %r11, %r11 + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_acorrloop +bignum_modexp_amont_end: + addq $0x8, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +// Local copy of bignum_demont + +bignum_modexp_local_demont: + pushq %rbx + pushq %rbp + pushq %r12 + testq %rdi, %rdi + je bignum_modexp_demont_end + movq (%rcx), %rax + movq %rax, %rbx + movq %rax, %r8 + shlq $0x2, %rbx + subq %rbx, %r8 + xorq $0x2, %r8 + movq %r8, %rbx + imulq %rax, %rbx + movl $0x2, %eax + addq %rbx, %rax + addq $0x1, %rbx + imulq %rax, %r8 + imulq %rbx, %rbx + movl $0x1, %eax + addq %rbx, %rax + imulq %rax, %r8 + imulq %rbx, %rbx + movl $0x1, %eax + addq %rbx, %rax + imulq %rax, %r8 + imulq %rbx, %rbx + movl $0x1, %eax + addq %rbx, %rax + imulq %rax, %r8 + xorq %rbx, %rbx +bignum_modexp_iloop: + movq (%rdx,%rbx,8), %rax + movq %rax, (%rsi,%rbx,8) + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_iloop + xorq %r9, %r9 +bignum_modexp_outerdemontloop: + movq (%rsi), %r11 + movq %r8, %rbp + imulq %r11, %rbp + movq (%rcx), %rax + mulq %rbp + addq %r11, %rax + movq %rdx, %r10 + movl $0x1, %ebx + movq %rdi, %r12 + decq %r12 + je bignum_modexp_demontend 
+bignum_modexp_demontloop: + adcq (%rsi,%rbx,8), %r10 + sbbq %r11, %r11 + movq (%rcx,%rbx,8), %rax + mulq %rbp + subq %r11, %rdx + addq %r10, %rax + movq %rax, -0x8(%rsi,%rbx,8) + movq %rdx, %r10 + incq %rbx + decq %r12 + jne bignum_modexp_demontloop +bignum_modexp_demontend: + adcq $0x0, %r10 + movq %r10, -0x8(%rsi,%rbx,8) + incq %r9 + cmpq %rdi, %r9 + jb bignum_modexp_outerdemontloop + xorq %rbx, %rbx + movq %rdi, %r12 +bignum_modexp_cmploop: + movq (%rsi,%rbx,8), %rax + sbbq (%rcx,%rbx,8), %rax + incq %rbx + decq %r12 + jne bignum_modexp_cmploop + sbbq %rbp, %rbp + notq %rbp + xorq %r11, %r11 + xorq %rbx, %rbx +bignum_modexp_dcorrloop: + movq (%rcx,%rbx,8), %rax + andq %rbp, %rax + negq %r11 + sbbq %rax, (%rsi,%rbx,8) + sbbq %r11, %r11 + incq %rbx + cmpq %rdi, %rbx + jb bignum_modexp_dcorrloop +bignum_modexp_demont_end: + popq %r12 + popq %rbp + popq %rbx + ret + +// Local copy of bignum_mux + +bignum_modexp_local_mux: + testq %rsi, %rsi + je bignum_modexp_muxend + xorq %r9, %r9 + negq %rdi +bignum_modexp_muxloop: + movq (%rcx,%r9,8), %rax + movq (%r8,%r9,8), %rdi + cmovae %rdi, %rax + movq %rax, (%rdx,%r9,8) + incq %r9 + decq %rsi + jne bignum_modexp_muxloop +bignum_modexp_muxend: + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modifier.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modifier.S new file mode 100644 index 00000000000..35e01f3a5ba --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modifier.S @@ -0,0 +1,541 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "modification" constant z := 2^{64k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_modifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// The last argument points to a temporary buffer t that should have size >= k. +// This is called "mod-ifier" because given any other k-digit number x we can +// get x MOD m simply and reasonably efficiently just by Montgomery +// multiplication of x and z. But one can also consider it the identity for +// Montgomery multiplication, assuming you have a reduced multiplier already. 
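+//
+// Illustrative usage sketch (not part of this file): with z = 2^{64k} mod m,
+// a single Montgomery multiplication reduces any k-digit x, since
+//
+//     montmul(x, z) == x * z * 2^{-64k} == x * 2^{64k} * 2^{-64k} == x (mod m)
+//
+// so, assuming the companion bignum_amontmul routine with argument order
+// (k, dest, x, y, m), something like:
+//
+//   bignum_modifier(k, z, m, t);        // z := 2^{64k} mod m
+//   bignum_amontmul(k, r, x, z, m);     // r == x (mod m), almost-reduced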
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = t +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modifier) + .text + +#define k %rdi +#define z %rsi + +// These two inputs get moved to different places since RCX and RDX are special + +#define m %r12 +#define t %r13 + +// Other variables + +#define i %rbx +// Modular inverse; aliased to i, but we never use them together +#define w %rbx +#define j %rbp +// Matters that this is RAX for special use in multiplies +#define a %rax +// Matters that this is RDX for special use in multiplies +#define d %rdx +// Matters that this is RCX as CL=lo(c) is assumed in shifts +#define c %rcx +#define h %r11 +#define l %r10 +#define b %r9 +#define n %r8 + +// Some aliases for the values b and n + +#define q %r8 +#define r %r9 + +#define ashort %eax +#define ishort %ebx +#define jshort %ebp +#define qshort %r8d + + +S2N_BN_SYMBOL(bignum_modifier): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save some additional registers for use, copy args out of RCX and RDX + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + + movq %rdx, m + movq %rcx, t + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_modifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. + + xorq i, i +bignum_modifier_copyinloop: + movq (m,i,8), c + movq c, (t,i,8) + incq i + cmpq k, i + jc bignum_modifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "neg c" sets the zeroness predicate (~CF) for the entire inner loop + + movq k, i + decq i + jz bignum_modifier_normalized +bignum_modifier_normloop: + xorq j, j + movq k, h + negq c + movl $0, ashort +bignum_modifier_shufloop: + movq a, c + movq (t,j,8), a + cmovcq a, c + movq c, (t,j,8) + incq j + decq h + jnz bignum_modifier_shufloop + decq i + jnz bignum_modifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. +// Note that we don't care about the result of bsr for zero inputs so +// the simple xor-ing with 63 is safe. + +bignum_modifier_normalized: + + bsrq c, c + xorq $63, c + + xorq b, b + xorq i, i +bignum_modifier_bitloop: + movq (t,i,8), a + movq a, j + shldq %cl, b, a + movq a, (t,i,8) + movq j, b + incq i + cmpq k, i + jc bignum_modifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. 
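+//
+// Illustrative (unverified) C rendering of this estimation loop, with the
+// branches written out for clarity (the code below is branch-free):
+//
+//   uint64_t q = 1, r = 0 - h;              // 2^64 = 1 * h + (2^64 - h), as h >= 2^63
+//   for (int i = 65; i <= 126; i++) {       // 62 steps, one quotient bit per step
+//     q = 2 * q;
+//     if (r >= h - r) { q += 1; r = r - (h - r); }   // 2*r >= h: subtract h
+//     else            { r = 2 * r; }                 // 2*r <  h: just double
+//   }
+//   q += (r >= h);                          // fixup for the r = h = 2^63 corner case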
+ + movq -8(t,k,8), h + movl $1, qshort + movq h, r + negq r + movl $62, ishort +bignum_modifier_estloop: + + addq q, q + movq h, a + subq r, a + cmpq a, r // CF <=> r < h - r <=> 2 * r < h + sbbq a, a + notq a // a = bitmask(2 * r >= h) + subq a, q + addq r, r + andq h, a + subq a, r + decq i + jnz bignum_modifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + incq r + cmpq r, h + adcq $0, q + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + xorq c, c + xorq i, i +bignum_modifier_mulloop: + movq (t,i,8), a + mulq q + addq c, a + adcq $0, d + movq a, (z,i,8) + movq d, c + incq i + cmpq k, i + jc bignum_modifier_mulloop + +// Now c is the high word of the product, so subtract 2^62 +// and then turn it into a bitmask in q = h + + movq $0x4000000000000000, a + subq a, c + sbbq q, q + notq q + +// Now do [c] * n - d for our final answer + + xorq c, c + xorq i, i +bignum_modifier_remloop: + movq (t,i,8), a + andq q, a + negq c + sbbq (z,i,8), a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_modifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from initial 2^{p+62} == r (mod n). 
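+//
+// Illustrative (unverified) C rendering of one such doubling pass, with the
+// final correction written as a branch for clarity (the code uses a mask, and
+// the modulus n here is the normalized copy held in the buffer t):
+//
+//   uint64_t hi = 0, borrow = 0;
+//   for (uint64_t j = 0; j < k; j++) {           // z := 2*z - n, word by word
+//     uint64_t d = (z[j] << 1) | hi;
+//     hi = z[j] >> 63;                           // bit shifted out of this word
+//     unsigned __int128 s = (unsigned __int128)d - n[j] - borrow;
+//     z[j] = (uint64_t)s;
+//     borrow = (uint64_t)(s >> 64) & 1;
+//   }
+//   if (borrow > hi) {                           // 2*z < n: add n back
+//     uint64_t carry = 0;
+//     for (uint64_t j = 0; j < k; j++) {
+//       unsigned __int128 s = (unsigned __int128)z[j] + n[j] + carry;
+//       z[j] = (uint64_t)s;
+//       carry = (uint64_t)(s >> 64);
+//     }
+//   }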
+ + xorq c, c + xorq j, j + xorq b, b +bignum_modifier_dubloop1: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_modifier_dubloop1 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_modifier_corrloop1: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_modifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + xorq c, c + xorq j, j + xorq b, b +bignum_modifier_dubloop2: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_modifier_dubloop2 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_modifier_corrloop2: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_modifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + xorq h, h + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this k more times so we end up == 2^{128*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. + + movq k, i +bignum_modifier_modloop: + xorq b, b + movq k, n + xorq j, j + xorq c, c +bignum_modifier_cmaloop: + adcq b, c + sbbq l, l + movq (z,j,8), a + mulq h + subq l, d + addq c, a + movq (t,j,8), b + movq a, (t,j,8) + movq d, c + incq j + decq n + jnz bignum_modifier_cmaloop + adcq c, b + movq b, h + + sbbq l, l + + xorq j, j + xorq c, c +bignum_modifier_oaloop: + movq (t,j,8), a + movq (z,j,8), b + andq l, b + negq c + adcq b, a + sbbq c, c + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_modifier_oaloop + subq c, h + + decq i + jnz bignum_modifier_modloop + +// Compute the negated modular inverse w (same register as i, not used again). + + movq (m), a + movq a, c + movq a, w + shlq $2, c + subq c, w + xorq $2, w + movq w, c + imulq a, c + movl $2, ashort + addq c, a + addq $1, c + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + +// Now do one almost-Montgomery reduction w.r.t. 
the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits + + movq (t), c + movq w, b + imulq c, b + + movq (m), a + mulq b + addq c, a + movq d, c + movl $1, jshort + movq k, n + decq n + jz bignum_modifier_amontend +bignum_modifier_amontloop: + adcq (t,j,8), c + sbbq l, l + movq (m,j,8), a + mulq b + subq l, d + addq c, a + movq a, -8(t,j,8) + movq d, c + incq j + decq n + jnz bignum_modifier_amontloop +bignum_modifier_amontend: + adcq c, h + sbbq l, l + movq h, -8(t,k,8) + + xorq j, j + xorq c, c +bignum_modifier_aosloop: + movq (t,j,8), a + movq (m,j,8), b + andq l, b + negq c + sbbq b, a + sbbq c, c + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_modifier_aosloop + +// So far, the code (basically the same as bignum_amontifier) has produced +// a k-digit value z == 2^{128k} (mod m), not necessarily fully reduced mod m. +// We now do a short Montgomery reduction (similar to bignum_demont) so that +// we achieve full reduction mod m while lopping 2^{64k} off the congruence. +// We recycle h as the somewhat strangely-named outer loop counter. + + movq k, h + +bignum_modifier_montouterloop: + movq (z), c + movq w, b + imulq c, b + movq (m), a + mulq b + addq c, a + movq d, c + movl $1, jshort + movq k, n + decq n + jz bignum_modifier_montend +bignum_modifier_montloop: + adcq (z,j,8), c + sbbq l, l + movq (m,j,8), a + mulq b + subq l, d + addq c, a + movq a, -8(z,j,8) + movq d, c + incq j + decq n + jnz bignum_modifier_montloop +bignum_modifier_montend: + adcq $0, c + movq c, -8(z,k,8) + + decq h + jnz bignum_modifier_montouterloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + xorq j, j + movq k, n +bignum_modifier_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_modifier_cmploop + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq l, l + xorq j, j +bignum_modifier_corrloop: + movq (m,j,8), a + andq d, a + negq l + sbbq a, (z,j,8) + sbbq l, l + incq j + cmpq k, j + jc bignum_modifier_corrloop + +bignum_modifier_end: + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modinv.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modinv.S new file mode 100644 index 00000000000..f343ddd8942 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modinv.S @@ -0,0 +1,709 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Invert modulo m, z = (1/a) mod b, assuming b is an odd number > 1, coprime a +// Inputs a[k], b[k]; output z[k]; temporary buffer t[>=3*k] +// +// extern void bignum_modinv +// (uint64_t k, uint64_t *z, uint64_t *a, uint64_t *b, uint64_t *t); +// +// k-digit (digit=64 bits) "z := a^-1 mod b" (modular inverse of a modulo b) +// using t as a temporary buffer (t at least 3*k words = 24*k bytes), and +// assuming that a and b are coprime *and* that b is an odd number > 1. 
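+//
+// Illustrative (unverified) one-word reference model of the specification,
+// using a plain binary extended gcd; this is NOT the constant-time,
+// multi-word algorithm implemented below, just a statement of what it
+// computes in the k = 1 case:
+//
+//   uint64_t ref_modinv_1(uint64_t a, uint64_t b) {  // b odd > 1, gcd(a,b) = 1
+//     uint64_t u = a, v = b, x1 = 1, x2 = 0;         // x1*a == u, x2*a == v (mod b)
+//     while (u != 1 && v != 1) {
+//       while (!(u & 1)) {                           // halve u, and x1 mod b
+//         u >>= 1;
+//         x1 = (x1 & 1) ? (x1 >> 1) + (b >> 1) + 1 : x1 >> 1;
+//       }
+//       while (!(v & 1)) {                           // halve v, and x2 mod b
+//         v >>= 1;
+//         x2 = (x2 & 1) ? (x2 >> 1) + (b >> 1) + 1 : x2 >> 1;
+//       }
+//       if (u >= v) { u -= v; x1 = (x1 >= x2) ? x1 - x2 : x1 + (b - x2); }
+//       else        { v -= u; x2 = (x2 >= x1) ? x2 - x1 : x2 + (b - x1); }
+//     }
+//     return (u == 1) ? x1 : x2;                     // the inverse of a mod b
+//   }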
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = a, RCX = b, R8 = t +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = a, R9 = b, [RSP+40] = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modinv) + .text + +// We get CHUNKSIZE bits per outer iteration, 64 minus a few for proxy errors + +#define CHUNKSIZE 58 + +// These variables are so fundamental we keep them consistently in registers. +// k actually stays where it was at the beginning, while l gets set up later + +#define k %rdi +#define l %r13 + +// These are kept on the stack since there aren't enough registers + +#define mat_mm (%rsp) +#define mat_mn 8(%rsp) +#define mat_nm 16(%rsp) +#define mat_nn 24(%rsp) +#define t 32(%rsp) +// Modular inverse +#define v 40(%rsp) +// We reconstruct n as m + 8*k as needed +#define m 48(%rsp) +#define w 56(%rsp) +#define z 64(%rsp) +// Original b pointer, not b the temp +#define bm 72(%rsp) + +#define STACKVARSIZE 80 + +// These get set to m/n or w/z during the cross-multiplications etc. +// Otherwise they can be used as additional temporaries + +#define p1 %r8 +#define p2 %r15 + +// These are shorthands for common temporary registers + +#define a %rax +#define b %rbx +#define c %rcx +#define d %rdx +#define i %r9 + +// Temporaries for the top proxy selection part + +#define c1 %r10 +#define c2 %r11 +#define h1 %r12 +#define h2 %rbp +#define l1 %r14 +#define l2 %rsi + +// Re-use for the actual proxies; m_hi = h1 and n_hi = h2 are assumed + +#define m_hi %r12 +#define n_hi %rbp +#define m_lo %r14 +#define n_lo %rsi + +// Re-use for the matrix entries in the inner loop, though they +// get spilled to the corresponding memory locations mat_... + +#define m_m %r10 +#define m_n %r11 +#define n_m %rcx +#define n_n %rdx + +#define ashort %eax +#define ishort %r9d +#define m_mshort %r10d +#define m_nshort %r11d +#define n_mshort %ecx +#define n_nshort %edx + +S2N_BN_SYMBOL(bignum_modinv): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Save all required registers and make room on stack for all the above vars + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $STACKVARSIZE, %rsp + +// If k = 0 then do nothing (this is out of scope anyway) + + testq k, k + jz bignum_modinv_end + +// Set up the additional two buffers m and n beyond w in temp space +// and record all pointers m, n, w and z in stack-based variables + + movq %rsi, z + movq %r8, w + movq %rcx, bm + leaq (%r8,k,8), %r10 + movq %r10, m + leaq (%r10,k,8), p2 + +// Initialize the main buffers with their starting values: +// m = a, n = b, w = b (to be tweaked to b - 1) and z = 0 + + xorq %r11, %r11 + xorq i, i +bignum_modinv_copyloop: + movq (%rdx,i,8), a + movq (%rcx,i,8), b + movq a, (%r10,i,8) + movq b, (p2,i,8) + movq b, (%r8,i,8) + movq %r11, (%rsi,i,8) + incq i + cmpq k, i + jc bignum_modinv_copyloop + +// Tweak down w to b - 1 (this crude approach is safe as b needs to be odd +// for it to be in scope). 
We have then established the congruence invariant: +// +// a * w == -m (mod b) +// a * z == n (mod b) +// +// This, with the bounds w <= b and z <= b, is maintained round the outer loop + + movq (%r8), a + movq a, b + decq b + movq b, (%r8) + +// Compute v = negated modular inverse of b mod 2^64, reusing a from above +// This is used for Montgomery reduction operations each time round the loop + + movq a, h2 + movq a, h1 + shlq $2, h2 + subq h2, h1 + xorq $2, h1 + + movq h1, h2 + imulq a, h2 + movl $2, ashort + addq h2, a + addq $1, h2 + + imulq a, h1 + + imulq h2, h2 + movl $1, ashort + addq h2, a + imulq a, h1 + + imulq h2, h2 + movl $1, ashort + addq h2, a + imulq a, h1 + + imulq h2, h2 + movl $1, ashort + addq h2, a + imulq a, h1 + + movq h1, v + +// Set up the outer loop count of 128 * k +// The invariant is that m * n < 2^t at all times. + + movq k, a + shlq $7, a + movq a, t + +// Start of the main outer loop iterated t / CHUNKSIZE times + +bignum_modinv_outerloop: + +// We need only bother with sharper l = min k (ceil(t/64)) digits +// for the computations on m and n (but we still need k for w and z). +// Either both m and n fit in l digits, or m has become zero and so +// nothing happens in the loop anyway and this makes no difference. + + movq t, l + addq $63, l + shrq $6, l + cmpq k, l + cmovncq k, l + +// Select upper and lower proxies for both m and n to drive the inner +// loop. The lower proxies are simply the lowest digits themselves, +// m_lo = m[0] and n_lo = n[0], while the upper proxies are bitfields +// of the two inputs selected so their top bit (63) aligns with the +// most significant bit of *either* of the two inputs. + + xorq h1, h1 // Previous high and low for m + xorq l1, l1 + xorq h2, h2 // Previous high and low for n + xorq l2, l2 + xorq c2, c2 // Mask flag: previous word of one was nonzero + // and in this case h1 and h2 are those words + + movq m, p1 + leaq (p1,k,8), p2 + xorq i, i +bignum_modinv_toploop: + movq (p1,i,8), b + movq (p2,i,8), c + movq c2, c1 + andq h1, c1 + andq h2, c2 + movq b, a + orq c, a + negq a + cmovcq c1, l1 + cmovcq c2, l2 + cmovcq b, h1 + cmovcq c, h2 + sbbq c2, c2 + incq i + cmpq l, i + jc bignum_modinv_toploop + + movq h1, a + orq h2, a + bsrq a, c + xorq $63, c + shldq %cl, l1, h1 + shldq %cl, l2, h2 + +// m_lo = m[0], n_lo = n[0]; + + movq (p1), %rax + movq %rax, m_lo + + movq (p2), %rax + movq %rax, n_lo + +// Now the inner loop, with i as loop counter from CHUNKSIZE down. +// This records a matrix of updates to apply to the initial +// values of m and n with, at stage j: +// +// sgn * m' = (m_m * m - m_n * n) / 2^j +// -sgn * n' = (n_m * m - n_n * n) / 2^j +// +// where "sgn" is either +1 or -1, and we lose track of which except +// that both instance above are the same. This throwing away the sign +// costs nothing (since we have to correct in general anyway because +// of the proxied comparison) and makes things a bit simpler. But it +// is simply the parity of the number of times the first condition, +// used as the swapping criterion, fires in this loop. 
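+//
+// Illustrative (unverified) C rendering of one step of this inner loop, with
+// the conditional swap written as branches for clarity (the code below is
+// branch-free; swap() is just shorthand for exchanging two variables):
+//
+//   for (int j = 0; j < CHUNKSIZE; j++) {
+//     if ((m_lo & 1) && m_hi < n_hi) {             // swap the roles of m and n
+//       swap(&m_hi, &n_hi); swap(&m_lo, &n_lo);
+//       swap(&m_m, &n_m);   swap(&m_n, &n_n);
+//     }
+//     if (m_lo & 1) {                              // make m even by subtracting n
+//       m_hi -= n_hi; m_lo -= n_lo;
+//       m_m += n_m;   m_n += n_n;
+//     }
+//     m_hi >>= 1; m_lo >>= 1;                      // halve m's proxies ...
+//     n_m <<= 1;  n_n <<= 1;                       // ... and double n's matrix row
+//   }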
+ + movl $1, m_mshort + movl $0, m_nshort + movl $0, n_mshort + movl $1, n_nshort + movl $CHUNKSIZE, ishort + +// Stash more variables over the inner loop to free up regs + + movq k, mat_mn + movq l, mat_nm + movq p1, mat_mm + movq p2, mat_nn + +// Conceptually in the inner loop we follow these steps: +// +// * If m_lo is odd and m_hi < n_hi, then swap the four pairs +// (m_hi,n_hi); (m_lo,n_lo); (m_m,n_m); (m_n,n_n) +// +// * Now, if m_lo is odd (old or new, doesn't matter as initial n_lo is odd) +// m_hi := m_hi - n_hi, m_lo := m_lo - n_lo +// m_m := m_m + n_m, m_n := m_n + n_n +// +// * Halve and double them +// m_hi := m_hi / 2, m_lo := m_lo / 2 +// n_m := n_m * 2, n_n := n_n * 2 +// +// The actual computation computes updates before actually swapping and +// then corrects as needed. + +bignum_modinv_innerloop: + + xorl %eax, %eax + xorl %ebx, %ebx + xorq p1, p1 + xorq p2, p2 + btq $0, m_lo + + cmovcq n_hi, %rax + cmovcq n_lo, %rbx + cmovcq n_m, p1 + cmovcq n_n, p2 + + movq m_lo, l + subq %rbx, m_lo + subq l, %rbx + movq m_hi, k + subq %rax, k + cmovcq m_hi, n_hi + leaq -1(k), m_hi + cmovcq %rbx, m_lo + cmovcq l, n_lo + notq m_hi + cmovcq m_m, n_m + cmovcq m_n, n_n + cmovncq k, m_hi + + shrq $1, m_lo + addq p1, m_m + addq p2, m_n + shrq $1, m_hi + addq n_m, n_m + addq n_n, n_n + +// End of the inner for-loop + + decq i + jnz bignum_modinv_innerloop + +// Unstash the temporary variables + + movq mat_mn, k + movq mat_nm, l + movq mat_mm, p1 + movq mat_nn, p2 + +// Put the matrix entries in memory since we're out of registers +// We pull them out repeatedly in the next loop + + movq m_m, mat_mm + movq m_n, mat_mn + movq n_m, mat_nm + movq n_n, mat_nn + +// Apply the update to w and z, using addition in this case, and also take +// the chance to shift an additional 6 = 64-CHUNKSIZE bits to be ready for a +// Montgomery multiplication. Because we know that m_m + m_n <= 2^58 and +// w, z <= b < 2^{64k}, we know that both of these fit in k+1 words. +// We do this before the m-n update to allow us to play with c1 and c2 here. 
+// +// l1::w = 2^6 * (m_m * w + m_n * z) +// l2::z = 2^6 * (n_m * w + n_n * z) +// +// with c1 and c2 recording previous words for the shifting part + + movq w, p1 + movq z, p2 + xorq l1, l1 + xorq l2, l2 + xorq c1, c1 + xorq c2, c2 + xorq i, i +bignum_modinv_congloop: + + movq (p1,i,8), c + movq mat_mm, a + mulq c + addq a, l1 + adcq $0, d + movq d, h1 // Now h1::l1 := m_m * w + l1_in + + movq mat_nm, a + mulq c + addq a, l2 + adcq $0, d + movq d, h2 // Now h2::l2 := n_m * w + l2_in + + movq (p2,i,8), c + movq mat_mn, a + mulq c + addq a, l1 + adcq d, h1 // h1::l1 := m_m * w + m_n * z + l1_in + shrdq $CHUNKSIZE, l1, c1 + movq c1, (p1,i,8) + movq l1, c1 + movq h1, l1 + + movq mat_nn, a + mulq c + addq a, l2 + adcq d, h2 // h2::l2 := n_m * w + n_n * z + l2_in + shrdq $CHUNKSIZE, l2, c2 + movq c2, (p2,i,8) + movq l2, c2 + movq h2, l2 + + incq i + cmpq k, i + jc bignum_modinv_congloop + + shldq $64-CHUNKSIZE, c1, l1 + shldq $64-CHUNKSIZE, c2, l2 + +// Do a Montgomery reduction of l1::w + + movq bm, p2 + + movq (p1), b + movq v, h1 + imulq b, h1 + movq (p2), a + mulq h1 + addq b, a // Will be zero but want the carry + movq %rdx, c1 + movl $1, ishort + movq k, c + decq c + jz bignum_modinv_wmontend + +bignum_modinv_wmontloop: + adcq (p1,i,8), c1 + sbbq b, b + movq (p2,i,8), a + mulq h1 + subq b, %rdx + addq c1, a + movq a, -8(p1,i,8) + movq %rdx, c1 + incq i + decq c + jnz bignum_modinv_wmontloop + +bignum_modinv_wmontend: + adcq l1, c1 + movq c1, -8(p1,k,8) + sbbq c1, c1 + negq c1 + + movq k, c + xorq i, i +bignum_modinv_wcmploop: + movq (p1,i,8), a + sbbq (p2,i,8), a + incq i + decq c + jnz bignum_modinv_wcmploop + sbbq $0, c1 + sbbq c1, c1 + notq c1 + + xorq c, c + xorq i, i +bignum_modinv_wcorrloop: + movq (p1,i,8), a + movq (p2,i,8), b + andq c1, b + negq c + sbbq b, a + sbbq c, c + movq a, (p1,i,8) + incq i + cmpq k, i + jc bignum_modinv_wcorrloop + +// Do a Montgomery reduction of l2::z + + movq z, p1 + + movq (p1), b + movq v, h2 + imulq b, h2 + movq (p2), a + mulq h2 + addq b, a // Will be zero but want the carry + movq %rdx, c2 + movl $1, ishort + movq k, c + decq c + jz bignum_modinv_zmontend + +bignum_modinv_zmontloop: + adcq (p1,i,8), c2 + sbbq b, b + movq (p2,i,8), a + mulq h2 + subq b, %rdx + addq c2, a + movq a, -8(p1,i,8) + movq %rdx, c2 + incq i + decq c + jnz bignum_modinv_zmontloop + +bignum_modinv_zmontend: + adcq l2, c2 + movq c2, -8(p1,k,8) + sbbq c2, c2 + negq c2 + + movq k, c + xorq i, i +bignum_modinv_zcmploop: + movq (p1,i,8), a + sbbq (p2,i,8), a + incq i + decq c + jnz bignum_modinv_zcmploop + sbbq $0, c2 + sbbq c2, c2 + notq c2 + + xorq c, c + xorq i, i +bignum_modinv_zcorrloop: + movq (p1,i,8), a + movq (p2,i,8), b + andq c2, b + negq c + sbbq b, a + sbbq c, c + movq a, (p1,i,8) + incq i + cmpq k, i + jc bignum_modinv_zcorrloop + +// Now actually compute the updates to m and n corresponding to the matrix, +// and correct the signs if they have gone negative. 
First we compute the +// (k+1)-sized updates with the following invariant (here h1 and h2 are in +// fact carry bitmasks, either 0 or -1): +// +// h1::l1::m = m_m * m - m_n * n +// h2::l2::n = n_m * m - n_n * n + + movq m, p1 + leaq (p1,k,8), p2 + xorq i, i + xorq h1, h1 + xorq l1, l1 + xorq h2, h2 + xorq l2, l2 +bignum_modinv_crossloop: + + movq (p1,i,8), c + movq mat_mm, a + mulq c + addq a, l1 + adcq $0, d + movq d, c1 // Now c1::l1 is +ve part 1 + + movq mat_nm, a + mulq c + addq a, l2 + adcq $0, d + movq d, c2 // Now c2::l2 is +ve part 2 + + movq (p2,i,8), c + movq mat_mn, a + mulq c + subq h1, d // Now d::a is -ve part 1 + + subq a, l1 + sbbq d, c1 + sbbq h1, h1 + movq l1, (p1,i,8) + movq c1, l1 + + movq mat_nn, a + mulq c + subq h2, d // Now d::a is -ve part 2 + + subq a, l2 + sbbq d, c2 + sbbq h2, h2 + movq l2, (p2,i,8) + movq c2, l2 + + incq i + cmpq l, i + jc bignum_modinv_crossloop + +// Now fix the signs of m and n if they have gone negative + + xorq i, i + movq h1, c1 // carry-in coded up as well + movq h2, c2 // carry-in coded up as well + xorq h1, l1 // for the bignum_modinv_end digit + xorq h2, l2 // for the bignum_modinv_end digit +bignum_modinv_optnegloop: + movq (p1,i,8), a + xorq h1, a + negq c1 + adcq $0, a + sbbq c1, c1 + movq a, (p1,i,8) + movq (p2,i,8), a + xorq h2, a + negq c2 + adcq $0, a + sbbq c2, c2 + movq a, (p2,i,8) + incq i + cmpq l, i + jc bignum_modinv_optnegloop + subq c1, l1 + subq c2, l2 + +// Now shift them right CHUNKSIZE bits + + movq l, i +bignum_modinv_shiftloop: + movq -8(p1,i,8), a + movq a, c1 + shrdq $CHUNKSIZE, l1, a + movq a, -8(p1,i,8) + movq c1, l1 + movq -8(p2,i,8), a + movq a, c2 + shrdq $CHUNKSIZE, l2, a + movq a, -8(p2,i,8) + movq c2, l2 + decq i + jnz bignum_modinv_shiftloop + +// Finally, use the signs h1 and h2 to do optional modular negations of +// w and z respectively, flipping h2 to make signs work. We don't make +// any checks for zero values, but we certainly retain w <= b and z <= b. +// This is enough for the Montgomery step in the next iteration to give +// strict reduction w < b amd z < b, and anyway when we terminate we +// could not have z = b since it violates the coprimality assumption for +// in-scope cases. + + notq h2 + movq bm, c + movq w, p1 + movq z, p2 + movq h1, c1 + movq h2, c2 + xorq i, i +bignum_modinv_fliploop: + movq h2, d + movq (c,i,8), a + andq a, d + andq h1, a + movq (p1,i,8), b + xorq h1, b + negq c1 + adcq b, a + sbbq c1, c1 + movq a, (p1,i,8) + movq (p2,i,8), b + xorq h2, b + negq c2 + adcq b, d + sbbq c2, c2 + movq d, (p2,i,8) + incq i + cmpq k, i + jc bignum_modinv_fliploop + +// End of main loop. We can stop if t' <= 0 since then m * n < 2^0, which +// since n is odd and m and n are coprime (in the in-scope cases) means +// m = 0, n = 1 and hence from the congruence invariant a * z == 1 (mod b). +// Moreover we do in fact need to maintain strictly t > 0 in the main loop, +// or the computation of the optimized digit bound l could collapse to 0. 
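A small sketch of the surrounding bookkeeping (hypothetical helper, not the assembly's register usage): t counts remaining bits and drops by CHUNKSIZE per round, and the digit bound l = min(k, ceil(t/64)) shrinks with it, which is why t has to stay strictly positive while the loop is still running.

#include <stdint.h>

static void modinv_outer_shape(uint64_t k)      /* assumes k >= 1 */
{
  const int64_t CHUNK = 58;                     /* CHUNKSIZE */
  for (int64_t t = 128 * (int64_t)k; t > 0; t -= CHUNK) {
    uint64_t l = ((uint64_t)t + 63) / 64;       /* ceil(t/64) */
    if (l > k) l = k;                           /* never more than k digits */
    (void)l;  /* ... one outer round over l digits of m/n, k digits of w/z ... */
  }
}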
+ + subq $CHUNKSIZE, t + jnbe bignum_modinv_outerloop + +bignum_modinv_end: + addq $STACKVARSIZE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modoptneg.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modoptneg.S new file mode 100644 index 00000000000..b575d00127b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modoptneg.S @@ -0,0 +1,97 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo m, z := (-x) mod m (if p nonzero) or z := x +// (if p zero), assuming x reduced +// Inputs p, x[k], m[k]; output z[k] +// +// extern void bignum_modoptneg +// (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x, uint64_t *m); +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = p, RCX = x, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = p, R9 = x, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modoptneg) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modoptneg) + .text + +#define k %rdi +#define z %rsi +#define p %rdx +#define x %rcx +#define m %r8 + +#define a %r9 +#define c %rax +#define b %r10 +#define i %r11 + +S2N_BN_SYMBOL(bignum_modoptneg): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Do nothing if k = 0 + + testq k, k + jz bignum_modoptneg_end + +// Make an additional check for zero input, and force p to zero in this case. +// This can be skipped if the input is known not to be zero a priori. + + xorq i, i + xorq a, a +bignum_modoptneg_cmploop: + orq (x,i,8), a + incq i + cmpq k, i + jc bignum_modoptneg_cmploop + + cmpq $0, a + cmovzq a, p + +// Turn the input p into a strict bitmask + + negq p + sbbq p, p + +// Main loop + + xorq i, i + movq p, c +bignum_modoptneg_mainloop: + movq (m,i,8), a + andq p, a + movq (x,i,8), b + xorq p, b + negq c + adcq b, a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_modoptneg_mainloop + +bignum_modoptneg_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modsub.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modsub.S new file mode 100644 index 00000000000..738a1bbb190 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_modsub.S @@ -0,0 +1,86 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo m, z := (x - y) mod m, assuming x and y reduced +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_modsub +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_modsub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_modsub) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define y %rcx +#define m %r8 +#define i %r9 +#define j %r10 +#define a %rax +#define c %r11 + +S2N_BN_SYMBOL(bignum_modsub): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// If k = 0 do nothing + + testq k, k + jz bignum_modsub_end + +// Subtract z := x - y and record a mask for the carry x - y < 0 + + xorq c, c + movq k, j + xorq i, i +bignum_modsub_subloop: + movq (x,i,8), a + sbbq (y,i,8), a + movq a, (z,i,8) + incq i + decq j + jnz bignum_modsub_subloop + sbbq c, c + +// Now do a masked addition z := z + [c] * m + + xorq i, i +bignum_modsub_addloop: + movq (m,i,8), a + andq c, a + negq j + adcq a, (z,i,8) + sbbq j, j + incq i + cmpq k, i + jc bignum_modsub_addloop + +bignum_modsub_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montifier.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montifier.S new file mode 100644 index 00000000000..c14035c15f0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montifier.S @@ -0,0 +1,540 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Compute "montification" constant z := 2^{128k} mod m +// Input m[k]; output z[k]; temporary buffer t[>=k] +// +// extern void bignum_montifier +// (uint64_t k, uint64_t *z, uint64_t *m, uint64_t *t); +// +// The last argument points to a temporary buffer t that should have size >= k. +// This is called "montifier" because given any other k-digit number x, +// whether or not it's reduced modulo m, it can be mapped to its Montgomery +// representation (2^{64k} * x) mod m just by Montgomery multiplication by z. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = t +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = t +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montifier) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montifier) + .text + +#define k %rdi +#define z %rsi + +// These two inputs get moved to different places since RCX and RDX are special + +#define m %r12 +#define t %r13 + +// Other variables + +#define i %rbx +// Modular inverse; aliased to i, but we never use them together +#define w %rbx +#define j %rbp +// Matters that this is RAX for special use in multiplies +#define a %rax +// Matters that this is RDX for special use in multiplies +#define d %rdx +// Matters that this is RCX as CL=lo(c) is assumed in shifts +#define c %rcx +#define h %r11 +#define l %r10 +#define b %r9 +#define n %r8 + +// Some aliases for the values b and n + +#define q %r8 +#define r %r9 + +#define ashort %eax +#define ishort %ebx +#define jshort %ebp +#define qshort %r8d + + +S2N_BN_SYMBOL(bignum_montifier): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save some additional registers for use, copy args out of RCX and RDX + + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + + movq %rdx, m + movq %rcx, t + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_montifier_end + +// Copy the input m into the temporary buffer t. The temporary register +// c matters since we want it to hold the highest digit, ready for the +// normalization phase. + + xorq i, i +bignum_montifier_copyinloop: + movq (m,i,8), c + movq c, (t,i,8) + incq i + cmpq k, i + jc bignum_montifier_copyinloop + +// Do a rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. +// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. +// The "neg c" sets the zeroness predicate (~CF) for the entire inner loop + + movq k, i + decq i + jz bignum_montifier_normalized +bignum_montifier_normloop: + xorq j, j + movq k, h + negq c + movl $0, ashort +bignum_montifier_shufloop: + movq a, c + movq (t,j,8), a + cmovcq a, c + movq c, (t,j,8) + incq j + decq h + jnz bignum_montifier_shufloop + decq i + jnz bignum_montifier_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift t bitwise that many bits. +// Note that we don't care about the result of bsr for zero inputs so +// the simple xor-ing with 63 is safe. + +bignum_montifier_normalized: + + bsrq c, c + xorq $63, c + + xorq b, b + xorq i, i +bignum_montifier_bitloop: + movq (t,i,8), a + movq a, j + shldq %cl, b, a + movq a, (t,i,8) + movq j, b + incq i + cmpq k, i + jc bignum_montifier_bitloop + +// Let h be the high word of n, which in all the in-scope cases is >= 2^63. +// Now successively form q = 2^i div h and r = 2^i mod h as i goes from +// 64 to 126. We avoid just using division out of constant-time concerns +// (at the least we would need to fix up h = 0 for out-of-scope inputs) and +// don't bother with Newton-Raphson, since this stupid simple loop doesn't +// contribute much of the overall runtime at typical sizes. 
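The same computation as a C sketch (hypothetical helper; the ternary stands in for the carry/mask sequence used below to stay constant-time): starting from 2^64 = 1*h + (2^64 - h) for normalized h >= 2^63, each step doubles q and r and moves one extra h into q whenever 2*r >= h, giving q = 2^126 div h and r = 2^126 mod h after 62 steps, up to the r = h = 2^63 corner case fixed afterwards.

#include <stdint.h>

static void estimate_q_r(uint64_t h, uint64_t *q_out, uint64_t *r_out)
{
  uint64_t q = 1;
  uint64_t r = 0 - h;                   /* 2^64 mod h, valid since h >= 2^63 */
  for (int i = 64; i < 126; i++) {      /* 62 doubling steps */
    uint64_t take = (r >= h - r);       /* 2*r >= h, computed without overflow */
    q = 2 * q + take;
    r = 2 * r;                          /* may wrap mod 2^64 ... */
    if (take) r -= h;                   /* ... but 2*r - h < h < 2^64 is exact */
  }
  *q_out = q;
  *r_out = r;
}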
+ + movq -8(t,k,8), h + movl $1, qshort + movq h, r + negq r + movl $62, ishort +bignum_montifier_estloop: + + addq q, q + movq h, a + subq r, a + cmpq a, r // CF <=> r < h - r <=> 2 * r < h + sbbq a, a + notq a // a = bitmask(2 * r >= h) + subq a, q + addq r, r + andq h, a + subq a, r + decq i + jnz bignum_montifier_estloop + +// Strictly speaking the above loop doesn't quite give the true remainder +// and quotient in the special case r = h = 2^63, so fix it up. We get +// q = 2^63 - 1 and r = 2^63 and really want q = 2^63 and r = 0. This is +// supererogatory, because the main property of q used below still holds +// in this case unless the initial m = 1, and then anyway the overall +// specification (congruence modulo m) holds degenerately. But it seems +// nicer to get a "true" quotient and remainder. + + incq r + cmpq r, h + adcq $0, q + +// So now we have q and r with 2^126 = q * h + r (imagining r = 0 in the +// fixed-up case above: note that we never actually use the computed +// value of r below and so didn't adjust it). And we can assume the ranges +// q <= 2^63 and r < h < 2^64. +// +// The idea is to use q as a first quotient estimate for a remainder +// of 2^{p+62} mod n, where p = 64 * k. We have, splitting n into the +// high and low parts h and l: +// +// 2^{p+62} - q * n = 2^{p+62} - q * (2^{p-64} * h + l) +// = 2^{p+62} - (2^{p-64} * (q * h) + q * l) +// = 2^{p+62} - 2^{p-64} * (2^126 - r) - q * l +// = 2^{p-64} * r - q * l +// +// Note that 2^{p-64} * r < 2^{p-64} * h <= n +// and also q * l < 2^63 * 2^{p-64} = 2^{p-1} <= n +// so |diff| = |2^{p-64} * r - q * l| < n. +// +// If in fact diff >= 0 then it is already 2^{p+62} mod n. +// otherwise diff + n is the right answer. +// +// To (maybe?) make the computation slightly easier we actually flip +// the sign and compute d = q * n - 2^{p+62}. Then the answer is either +// -d (when negative) or n - d; in either case we effectively negate d. +// This negating tweak in fact spoils the result for cases where +// 2^{p+62} mod n = 0, when we get n instead. However the only case +// where this can happen is m = 1, when the whole spec holds trivially, +// and actually the remainder of the logic below works anyway since +// the latter part of the code only needs a congruence for the k-digit +// result, not strict modular reduction (the doublings will maintain +// the non-strict inequality). + + xorq c, c + xorq i, i +bignum_montifier_mulloop: + movq (t,i,8), a + mulq q + addq c, a + adcq $0, d + movq a, (z,i,8) + movq d, c + incq i + cmpq k, i + jc bignum_montifier_mulloop + +// Now c is the high word of the product, so subtract 2^62 +// and then turn it into a bitmask in q = h + + movq $0x4000000000000000, a + subq a, c + sbbq q, q + notq q + +// Now do [c] * n - d for our final answer + + xorq c, c + xorq i, i +bignum_montifier_remloop: + movq (t,i,8), a + andq q, a + negq c + sbbq (z,i,8), a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_montifier_remloop + +// Now still need to do a couple of modular doublings to get us all the +// way up to 2^{p+64} == r from initial 2^{p+62} == r (mod n). 
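A rough C model of one such doubling pass (hypothetical helper; assumes a compiler with unsigned __int128, and uses a branch where the assembly uses a mask): compute z := 2*z - n with a word-level borrow chain, then add n back if the result went negative, keeping z in k words with z <= n.

#include <stdint.h>

static void modular_double(uint64_t k, uint64_t *z, const uint64_t *n)
{
  uint64_t hi = 0, borrow = 0;
  for (uint64_t i = 0; i < k; i++) {          /* z := 2*z - n */
    uint64_t d = (z[i] << 1) | hi;            /* 2*z, word by word */
    hi = z[i] >> 63;
    unsigned __int128 s = (unsigned __int128)d - n[i] - borrow;
    z[i] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;
  }
  if (hi < borrow) {                          /* 2*z - n went negative */
    uint64_t carry = 0;
    for (uint64_t i = 0; i < k; i++) {        /* z := z + n */
      unsigned __int128 s = (unsigned __int128)z[i] + n[i] + carry;
      z[i] = (uint64_t)s;
      carry = (uint64_t)(s >> 64);
    }
  }
}

Running this twice moves the residue from 2^{p+62} to 2^{p+64} modulo n, as described above.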
+ + xorq c, c + xorq j, j + xorq b, b +bignum_montifier_dubloop1: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_montifier_dubloop1 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_montifier_corrloop1: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_montifier_corrloop1 + +// This is not exactly the same: we also copy output to t giving the +// initialization t_1 = r == 2^{p+64} mod n for the main loop next. + + xorq c, c + xorq j, j + xorq b, b +bignum_montifier_dubloop2: + movq (z,j,8), a + shrdq $63, a, c + negq b + sbbq (t,j,8), c + sbbq b, b + movq c, (z,j,8) + movq a, c + incq j + cmpq k, j + jc bignum_montifier_dubloop2 + shrq $63, c + addq b, c + xorq j, j + xorq b, b +bignum_montifier_corrloop2: + movq (t,j,8), a + andq c, a + negq b + adcq (z,j,8), a + sbbq b, b + movq a, (z,j,8) + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_montifier_corrloop2 + +// We then successively generate (k+1)-digit values satisfying +// t_i == 2^{p+64*i} mod n, each of which is stored in h::t. Finish +// initialization by zeroing h initially + + xorq h, h + +// Then if t_i = 2^{p} * h + l +// we have t_{i+1} == 2^64 * t_i +// = (2^{p+64} * h) + (2^64 * l) +// == r * h + l<<64 +// Do this 2*k more times so we end up == 2^{192*k+64}, one more than we want +// +// Writing B = 2^{64k}, the possible correction of adding r, which for +// a (k+1)-digit result is equivalent to subtracting q = 2^{64*(k+1)} - r +// would give the overall worst-case value minus q of +// [ B * (B^k - 1) + (B - 1) * r ] - [B^{k+1} - r] +// = B * (r - 1) < B^{k+1} so we keep inside k+1 digits as required. +// +// This implementation makes the shift implicit by starting b with the +// "previous" digit (initially 0) to offset things by 1. + + leaq (k,k), i +bignum_montifier_modloop: + xorq b, b + movq k, n + xorq j, j + xorq c, c +bignum_montifier_cmaloop: + adcq b, c + sbbq l, l + movq (z,j,8), a + mulq h + subq l, d + addq c, a + movq (t,j,8), b + movq a, (t,j,8) + movq d, c + incq j + decq n + jnz bignum_montifier_cmaloop + adcq c, b + movq b, h + + sbbq l, l + + xorq j, j + xorq c, c +bignum_montifier_oaloop: + movq (t,j,8), a + movq (z,j,8), b + andq l, b + negq c + adcq b, a + sbbq c, c + movq a, (t,j,8) + incq j + cmpq k, j + jc bignum_montifier_oaloop + subq c, h + + decq i + jnz bignum_montifier_modloop + +// Compute the negated modular inverse w (same register as i, not used again). + + movq (m), a + movq a, c + movq a, w + shlq $2, c + subq c, w + xorq $2, w + movq w, c + imulq a, c + movl $2, ashort + addq c, a + addq $1, c + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + imulq c, c + movl $1, ashort + addq c, a + imulq a, w + +// Now do one almost-Montgomery reduction w.r.t. 
the original m +// which lops off one 2^64 from the congruence and, with the usual +// almost-Montgomery correction, gets us back inside k digits + + movq (t), c + movq w, b + imulq c, b + + movq (m), a + mulq b + addq c, a + movq d, c + movl $1, jshort + movq k, n + decq n + jz bignum_montifier_amontend +bignum_montifier_amontloop: + adcq (t,j,8), c + sbbq l, l + movq (m,j,8), a + mulq b + subq l, d + addq c, a + movq a, -8(t,j,8) + movq d, c + incq j + decq n + jnz bignum_montifier_amontloop +bignum_montifier_amontend: + adcq c, h + sbbq l, l + movq h, -8(t,k,8) + + xorq j, j + xorq c, c +bignum_montifier_aosloop: + movq (t,j,8), a + movq (m,j,8), b + andq l, b + negq c + sbbq b, a + sbbq c, c + movq a, (z,j,8) + incq j + cmpq k, j + jc bignum_montifier_aosloop + +// So far, the code (basically a variant of bignum_amontifier) has produced +// a k-digit value z == 2^{192k} (mod m), not necessarily fully reduced mod m. +// We now do a short Montgomery reduction (similar to bignum_demont) so that +// we achieve full reduction mod m while lopping 2^{64k} off the congruence. +// We recycle h as the somewhat strangely-named outer loop counter. + + movq k, h + +bignum_montifier_montouterloop: + movq (z), c + movq w, b + imulq c, b + movq (m), a + mulq b + addq c, a + movq d, c + movl $1, jshort + movq k, n + decq n + jz bignum_montifier_montend +bignum_montifier_montloop: + adcq (z,j,8), c + sbbq l, l + movq (m,j,8), a + mulq b + subq l, d + addq c, a + movq a, -8(z,j,8) + movq d, c + incq j + decq n + jnz bignum_montifier_montloop +bignum_montifier_montend: + adcq $0, c + movq c, -8(z,k,8) + + decq h + jnz bignum_montifier_montouterloop + +// Now do a comparison of z with m to set a final correction mask +// indicating that z >= m and so we need to subtract m. + + xorq j, j + movq k, n +bignum_montifier_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_montifier_cmploop + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq l, l + xorq j, j +bignum_montifier_corrloop: + movq (m,j,8), a + andq d, a + negq l + sbbq a, (z,j,8) + sbbq l, l + incq j + cmpq k, j + jc bignum_montifier_corrloop + +bignum_montifier_end: + popq %r13 + popq %r12 + popq %rbx + popq %rbp + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montmul.S new file mode 100644 index 00000000000..0a914f8f4fe --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montmul.S @@ -0,0 +1,260 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^{64k}) mod m +// Inputs x[k], y[k], m[k]; output z[k] +// +// extern void bignum_montmul +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y, uint64_t *m); +// +// Does z := (x * y / 2^{64k}) mod m, assuming x * y <= 2^{64k} * m, which is +// guaranteed in particular if x < m, y < m initially (the "intended" case). 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = y, R8 = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = y, [RSP+40] = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul) + .text + +// We copy x to %r9 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define x %r9 +#define y %rcx +#define m %r8 + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r10 +#define e %r11 +#define n %r12 +#define i %r13 +#define c0 %r14 +#define c1 %r15 + +// This one variable we store on the stack as we are a register short. +// At least it's only used once per iteration of the outer loop (k times) +// and with a single read each time, after one initial write. It's the +// word-level negated modular inverse. + +#define w (%rsp) + +// Some more intuitive names for temp regs in initial word-level negmodinv. + +#define t1 %rbx +#define t2 %rdx + +#define ashort %eax +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_montmul): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Save registers and allocate space on stack for non-register variable w + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8, %rsp + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_montmul_end + +// Move x input into its permanent home, since we need %rdx for multiplications + + movq %rdx, x + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t2 + movq a, t1 + shlq $2, t2 + subq t2, t1 + xorq $2, t1 + + movq t1, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + movq t1, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + xorq i, i // Also initializes i for main loop + xorq j, j +bignum_montmul_zoop: + movq i, (z,j,8) + incq j + cmpq k, j + jc bignum_montmul_zoop + + xorq c0, c0 + +// Outer loop pulling down digits d=x[i], multiplying by y and reducing + +bignum_montmul_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in. +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. 
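As a word-level C model of what one outer iteration accomplishes (hypothetical helper montmul_step; assumes unsigned __int128 and k >= 1, and ignores the register and carry-flag tricks below): add d*y into the running (carry::z) value, then fold in u*m with u = w*z[0] so the low word vanishes, and shift everything down one word. The same skeleton reappears in bignum_montsqr and bignum_montredc; after k such steps the caller still needs the final compare-and-subtract correction.

#include <stdint.h>

static void montmul_step(uint64_t k, uint64_t *z, uint64_t *carry,
                         uint64_t d, const uint64_t *y,
                         const uint64_t *m, uint64_t w)
{
  unsigned __int128 t;
  uint64_t c = 0;
  for (uint64_t i = 0; i < k; i++) {           /* z += d * y */
    t = (unsigned __int128)d * y[i] + z[i] + c;
    z[i] = (uint64_t)t;
    c = (uint64_t)(t >> 64);
  }
  *carry += c;                                 /* top word, may carry out */
  uint64_t c1 = (*carry < c);                  /* the extra carry bit */

  uint64_t u = w * z[0];                       /* Montgomery multiplier */
  t = (unsigned __int128)u * m[0] + z[0];      /* low word becomes zero */
  c = (uint64_t)(t >> 64);
  for (uint64_t i = 1; i < k; i++) {           /* z := (z + u*m) >> 64 */
    t = (unsigned __int128)u * m[i] + z[i] + c;
    z[i - 1] = (uint64_t)t;
    c = (uint64_t)(t >> 64);
  }
  t = (unsigned __int128)c + *carry;
  z[k - 1] = (uint64_t)t;
  *carry = (uint64_t)(t >> 64) + c1;
}

Calling it as for (i = 0; i < k; i++) montmul_step(k, z, &c0, x[i], y, m, w); mirrors the outer loop here.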
+ + movq (x,i,8), d + xorq j, j + xorq h, h + xorq c1, c1 + movq k, n + +bignum_montmul_maddloop: + adcq (z,j,8), h + sbbq e, e + movq (y,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, (z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_montmul_maddloop + adcq h, c0 + adcq c1, c1 + +// Montgomery reduction loop, similar but offsetting writebacks + + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, n + decq n + jz bignum_montmul_montend + +bignum_montmul_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_montmul_montloop + +bignum_montmul_montend: + adcq c0, h + adcq $0, c1 + movq c1, c0 + movq h, -8(z,j,8) + +// End of outer loop. + + incq i + cmpq k, i + jc bignum_montmul_outerloop + +// Now do a comparison of (c0::z) with (0::m) to set a final correction mask +// indicating that (c0::z) >= m and so we need to subtract m. + + xorq j, j + movq k, n +bignum_montmul_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_montmul_cmploop + + sbbq $0, c0 + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq e, e + xorq j, j +bignum_montmul_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_montmul_corrloop + +bignum_montmul_end: + addq $8, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montredc.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montredc.S new file mode 100644 index 00000000000..c023023f89b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montredc.S @@ -0,0 +1,264 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery reduce, z := (x' / 2^{64p}) MOD m +// Inputs x[n], m[k], p; output z[k] +// +// extern void bignum_montredc +// (uint64_t k, uint64_t *z, +// uint64_t n, uint64_t *x, uint64_t *m, uint64_t p); +// +// Does a := (x' / 2^{64p}) mod m where x' = x if n <= p + k and in general +// is the lowest (p+k) digits of x, assuming x' <= 2^{64p} * m. That is, +// p-fold Montgomery reduction w.r.t. a k-digit modulus m giving a k-digit +// answer. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = m, R9 = p +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = m, [RSP+48] = p +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montredc) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montredc) + .text + + // We copy n into %r10 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define n %r10 +#define x %rcx +#define m %r8 +#define p %r9 + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Negated modular inverse +#define w (%rsp) +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r11 +#define e %r12 +#define t %r13 +#define i %r14 +#define c %r15 + +// Some more intuitive names for temp regs in initial word-level negmodinv. + +#define t1 %rbx +#define t2 %r14 + +#define ashort %eax +#define cshort %r15d +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_montredc): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// Save registers and allocate space on stack for non-register variable w + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $8, %rsp + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_montredc_end + +// Move n input into its permanent home, since we need %rdx for multiplications + + movq %rdx, n + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t2 + movq a, t1 + shlq $2, t2 + subq t2, t1 + xorq $2, t1 + + movq t1, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, t1 + + movq t1, w + +// Initialize z to the lowest k digits of the input, zero-padding if n < k. 
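In C terms this initialization is simply the following (hypothetical helper, shown for clarity; the digit counts n and k are public, so library calls are fine here):

#include <stdint.h>
#include <string.h>

static void init_low_digits(uint64_t k, uint64_t *z, uint64_t n,
                            const uint64_t *x)
{
  uint64_t j = (n < k) ? n : k;                 /* copy min(n, k) digits */
  memcpy(z, x, j * sizeof(uint64_t));
  memset(z + j, 0, (k - j) * sizeof(uint64_t)); /* zero-pad the rest */
}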
+ + movq k, j + cmpq k, n + cmovcq n, j + xorq i, i + testq j, j + jz bignum_montredc_padloop +bignum_montredc_copyloop: + movq (x,i,8), a + movq a, (z,i,8) + incq i + cmpq j, i + jc bignum_montredc_copyloop + + cmpq k, i + jnc bignum_montredc_initialized + + xorq j, j +bignum_montredc_padloop: + movq j, (z,i,8) + incq i + cmpq k, i + jc bignum_montredc_padloop + +bignum_montredc_initialized: + xorq c, c + +// Now if p = 0 we just need the corrective tail, and even that is +// only needed for the case when the input is exactly the modulus, +// to maintain the <= 2^64p * n precondition + + testq p, p + jz bignum_montredc_corrective + +// Outer loop, just doing a standard Montgomery reduction on z + + xorq i, i +bignum_montredc_outerloop: + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, t + decq t + jz bignum_montredc_montend + +bignum_montredc_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq t + jnz bignum_montredc_montloop + +bignum_montredc_montend: + adcq c, h + movl $0, cshort + adcq $0, c + + addq i, j + cmpq n, j + jnc bignum_montredc_offtheend + movq (x,j,8), a + addq a, h + adcq $0, c +bignum_montredc_offtheend: + movq h, -8(z,k,8) + +// End of outer loop. + + incq i + cmpq p, i + jc bignum_montredc_outerloop + +// Now do a comparison of (c::z) with (0::m) to set a final correction mask +// indicating that (c::z) >= m and so we need to subtract m. + +bignum_montredc_corrective: + + xorq j, j + movq k, n +bignum_montredc_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_montredc_cmploop + + sbbq $0, c + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq e, e + xorq j, j +bignum_montredc_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_montredc_corrloop + +bignum_montredc_end: + addq $8, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montsqr.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montsqr.S new file mode 100644 index 00000000000..f028239dd3d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_montsqr.S @@ -0,0 +1,248 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^{64k}) mod m +// Inputs x[k], m[k]; output z[k] +// +// extern void bignum_montsqr +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t *m); +// +// Does z := (x^2 / 2^{64k}) mod m, assuming x^2 <= 2^{64k} * m, which is +// guaranteed in particular if x < m initially (the "intended" case). 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = m +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = m +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr) + .text + +// We copy x into %r9 but it comes in in %rdx originally + +#define k %rdi +#define z %rsi +#define x %r9 +#define m %rcx + +// General temp, low part of product and mul input +#define a %rax +// General temp, High part of product +#define b %rdx +// Negated modular inverse +#define w %r8 +// Inner loop counter +#define j %rbx +// Home for i'th digit or Montgomery multiplier +#define d %rbp +#define h %r10 +#define e %r11 +#define n %r12 +#define i %r13 +#define c0 %r14 +#define c1 %r15 + +// A temp reg in the initial word-level negmodinv. + +#define t2 %rdx + +#define ashort %eax +#define jshort %ebx + + +S2N_BN_SYMBOL(bignum_montsqr): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save registers + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// If k = 0 the whole operation is trivial + + testq k, k + jz bignum_montsqr_end + +// Move x input into its permanent home, since we need %rdx for multiplications + + movq %rdx, x + +// Compute word-level negated modular inverse w for m == m[0]. + + movq (m), a + + movq a, t2 + movq a, w + shlq $2, t2 + subq t2, w + xorq $2, w + + movq w, t2 + imulq a, t2 + movl $2, ashort + addq t2, a + addq $1, t2 + + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + + imulq t2, t2 + movl $1, ashort + addq t2, a + imulq a, w + +// Initialize the output c0::z to zero so we can then consistently add rows. +// It would be a bit more efficient to special-case the zeroth row, but +// this keeps the code slightly simpler. + + xorq i, i // Also initializes i for main loop + xorq j, j +bignum_montsqr_zoop: + movq i, (z,j,8) + incq j + cmpq k, j + jc bignum_montsqr_zoop + + xorq c0, c0 + +// Outer loop pulling down digits d=x[i], multiplying by x and reducing + +bignum_montsqr_outerloop: + +// Multiply-add loop where we always have CF + previous high part h to add in. +// Note that in general we do need yet one more carry in this phase and hence +// initialize c1 with the top carry. + + movq (x,i,8), d + xorq j, j + xorq h, h + xorq c1, c1 + movq k, n + +bignum_montsqr_maddloop: + adcq (z,j,8), h + sbbq e, e + movq (x,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, (z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_montsqr_maddloop + adcq h, c0 + adcq c1, c1 + +// Montgomery reduction loop, similar but offsetting writebacks + + movq (z), e + movq w, d + imulq e, d + movq (m), a + mulq d + addq e, a // Will be zero but want the carry + movq %rdx, h + movl $1, jshort + movq k, n + decq n + jz bignum_montsqr_montend + +bignum_montsqr_montloop: + adcq (z,j,8), h + sbbq e, e + movq (m,j,8), a + mulq d + subq e, %rdx + addq h, a + movq a, -8(z,j,8) + movq %rdx, h + incq j + decq n + jnz bignum_montsqr_montloop + +bignum_montsqr_montend: + adcq c0, h + adcq $0, c1 + movq c1, c0 + movq h, -8(z,j,8) + +// End of outer loop. + + incq i + cmpq k, i + jc bignum_montsqr_outerloop + +// Now do a comparison of (c0::z) with (0::m) to set a final correction mask +// indicating that (c0::z) >= m and so we need to subtract m. 
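A C model of this ending pattern, shared by the Montgomery routines in this batch (hypothetical helper; assumes unsigned __int128, and derives the mask with a comparison where the assembly uses sbb/not): a trial subtraction decides whether (carry::z) >= m, and a masked second pass then subtracts either m or 0 with the same memory traffic either way.

#include <stdint.h>

static void final_correction(uint64_t k, uint64_t *z, uint64_t carry,
                             const uint64_t *m)
{
  uint64_t borrow = 0;
  for (uint64_t i = 0; i < k; i++) {           /* trial compare z - m */
    unsigned __int128 s = (unsigned __int128)z[i] - m[i] - borrow;
    borrow = (uint64_t)(s >> 64) & 1;
  }
  uint64_t mask = (carry || !borrow) ? ~(uint64_t)0 : 0; /* (carry::z) >= m */
  borrow = 0;
  for (uint64_t i = 0; i < k; i++) {           /* masked subtraction of m */
    unsigned __int128 s = (unsigned __int128)z[i] - (m[i] & mask) - borrow;
    z[i] = (uint64_t)s;
    borrow = (uint64_t)(s >> 64) & 1;
  }
}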
+ + xorq j, j + movq k, n +bignum_montsqr_cmploop: + movq (z,j,8), a + sbbq (m,j,8), a + incq j + decq n + jnz bignum_montsqr_cmploop + + sbbq $0, c0 + sbbq d, d + notq d + +// Now do a masked subtraction of m for the final reduced result. + + xorq e, e + xorq j, j +bignum_montsqr_corrloop: + movq (m,j,8), a + andq d, a + negq e + sbbq a, (z,j,8) + sbbq e, e + incq j + cmpq k, j + jc bignum_montsqr_corrloop + +bignum_montsqr_end: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mul.S new file mode 100644 index 00000000000..060064a4c7d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mul.S @@ -0,0 +1,156 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply z := x * y +// Inputs x[m], y[n]; output z[k] +// +// extern void bignum_mul +// (uint64_t k, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the "z := x * y" operation where x is m digits, y is n, result z is k. +// Truncates the result in general unless k >= m + n +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul) + .text + +// These are actually right + +#define p %rdi +#define z %rsi +#define n %r8 + +// These are not + +#define c %r15 +#define h %r14 +#define l %r13 +#define x %r12 +#define y %r11 +#define i %rbx +#define k %r10 +#define m %rbp + +// These are always local scratch since multiplier result is in these + +#define a %rax +#define d %rdx + + + +S2N_BN_SYMBOL(bignum_mul): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// We use too many registers, and also we need %rax:%rdx for multiplications + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, m + +// If the result size is zero, do nothing +// Note that even if either or both inputs has size zero, we can't +// just give up because we at least need to zero the output array +// If we did a multiply-add variant, however, then we could + + testq p, p + jz bignum_mul_end + +// Set initial 2-part sum to zero (we zero c inside the body) + + xorq h, h + xorq l, l + +// Otherwise do outer loop k = 0 ... 
k = p - 1 + + xorq k, k + +bignum_mul_outerloop: + +// Zero our carry term first; we eventually want it and a zero is useful now +// Set a = max 0 (k + 1 - n), i = min (k + 1) m +// This defines the range a <= j < i for the inner summation +// Note that since k < p < 2^64 we can assume k + 1 doesn't overflow +// And since we want to increment it anyway, we might as well do it now + + xorq c, c // c = 0 + incq k // k = k + 1 + + movq k, a // a = k + 1 + subq n, a // a = k + 1 - n + cmovcq c, a // a = max 0 (k + 1 - n) + + movq m, i // i = m + cmpq m, k // CF <=> k + 1 < m + cmovcq k, i // i = min (k + 1) m + +// Turn i into a loop count, and skip things if it's <= 0 +// Otherwise set up initial pointers x -> x0[a] and y -> y0[k - a] +// and then launch into the main inner loop, postdecrementing i + + movq k, d + subq i, d + subq a, i + jbe bignum_mul_innerend + leaq (%rcx,a,8), x + leaq -8(%r9,d,8), y + +bignum_mul_innerloop: + movq (y,i,8), %rax + mulq (x) + addq $8, x + addq %rax, l + adcq %rdx, h + adcq $0, c + decq i + jnz bignum_mul_innerloop + +bignum_mul_innerend: + + movq l, (z) + movq h, l + movq c, h + addq $8, z + + cmpq p, k + jc bignum_mul_outerloop + +bignum_mul_end: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_muladd10.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_muladd10.S new file mode 100644 index 00000000000..2215714cc31 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_muladd10.S @@ -0,0 +1,79 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply bignum by 10 and add word: z := 10 * z + d +// Inputs z[k], d; outputs function return (carry) and z[k] +// +// extern uint64_t bignum_muladd10 (uint64_t k, uint64_t *z, uint64_t d); +// +// Although typically the input d < 10, this is not actually required. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = d, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = d, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_muladd10) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_muladd10) + .text + +#define k %rdi +#define z %rsi +#define d %rcx + +#define a %rax +#define l %rax + +#define h %rdx +#define i %r8 +#define ten %r9 +#define tenshort %r9d + +S2N_BN_SYMBOL(bignum_muladd10): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Move carry input to permanent home, and if k = 0 skip the main loop + + movq %rdx, d + testq k, k + jz bignum_muladd10_end + +// Simple loop + + xorq i, i + movl $10, tenshort +bignum_muladd10_loop: + movq (z,i,8), a + mulq ten + addq d, l + movq l, (z,i,8) + adcq $0, h + movq h, d + incq i + cmpq k, i + jc bignum_muladd10_loop + +// Return the final carry + +bignum_muladd10_end: + movq d, %rax +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux.S new file mode 100644 index 00000000000..5ec5435ee14 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux.S @@ -0,0 +1,68 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[k], y[k]; output z[k] +// +// extern void bignum_mux +// (uint64_t p, uint64_t k, uint64_t *z, uint64_t *x, uint64_t *y); +// +// It is assumed that all numbers x, y and z have the same size k digits. +// +// Standard x86-64 ABI: RDI = p, RSI = k, RDX = z, RCX = x, R8 = y +// Microsoft x64 ABI: RCX = p, RDX = k, R8 = z, R9 = x, [RSP+40] = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux) + .text + +#define b %rdi +#define k %rsi +#define z %rdx +#define x %rcx +#define y %r8 +#define i %r9 +#define a %rax + + + +S2N_BN_SYMBOL(bignum_mux): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + testq k, k + jz bignum_mux_end // If length = 0 do nothing + + xorq i, i + negq b // CF <=> (b != 0) +bignum_mux_loop: + movq (x,i,8), a + movq (y,i,8), b + cmovncq b, a // CF ? a : b + movq a, (z,i,8) + incq i + decq k + jnz bignum_mux_loop +bignum_mux_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux16.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux16.S new file mode 100644 index 00000000000..c04a0f44eaa --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_mux16.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Select element from 16-element table, z := xs[k*i] +// Inputs xs[16*k], i; output z[k] +// +// extern void bignum_mux16 +// (uint64_t k, uint64_t *z, uint64_t *xs, uint64_t i); +// +// It is assumed that all numbers xs[16] and the target z have the same size k +// The pointer xs is to a contiguous array of size 16, elements size-k bignums +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = xs, RCX = i +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = xs, R9 = i +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux16) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux16) + .text + +#define k %rdi +#define z %rsi + +// These get moved from original registers + +#define x %rcx +#define i %rax + +// Other registers + +#define a %rdx +#define b %r8 +#define j %r9 +#define n %r10 + + + +S2N_BN_SYMBOL(bignum_mux16): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + + +// Copy size into decrementable counter, or skip everything if k = 0 + + testq k, k + jz bignum_mux16_end // If length = 0 do nothing + movq k, n + +// Multiply i by k so we can compare pointer offsets directly with it + + movq %rcx, %rax + movq %rdx, %rcx + mulq k + +// Main loop + +bignum_mux16_loop: + movq (x), a + movq k, j +.rep 15 + movq (x,j,8), b + cmpq i, j + cmoveq b, a + addq k, j +.endr + movq a, (z) + addq $8, z + addq $8, x + decq n + jnz bignum_mux16_loop + +bignum_mux16_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_negmodinv.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_negmodinv.S new file mode 100644 index 00000000000..203a7ba5fbd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_negmodinv.S @@ -0,0 +1,186 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negated modular inverse, z := (-1/x) mod 2^{64k} +// Input x[k]; output z[k] +// +// extern void bignum_negmodinv +// (uint64_t k, uint64_t *z, uint64_t *x); +// +// Assuming x is odd (otherwise nothing makes sense) the result satisfies +// +// x * z + 1 == 0 (mod 2^{64 * k}) +// +// but is not necessarily reduced mod x. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_negmodinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_negmodinv) + .text + +#define k %rdi +#define z %rsi +// Moved from initial location to free %rdx +#define x %rcx + +#define a %rax +#define d %rdx +#define i %r8 +#define m %r9 +#define h %r10 +#define w %r11 +#define t %r12 +#define e %rbx + +#define ashort %eax +#define ishort %r8d + +S2N_BN_SYMBOL(bignum_negmodinv): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + + pushq %rbx + pushq %r12 + +// If k = 0 do nothing (actually we could have avoiding the pushes and pops) + + testq k, k + jz bignum_negmodinv_end + +// Move the x pointer into its permanent home (%rdx is needed for muls) + + movq %rdx, x + +// Compute word-level negated modular inverse w for x[0]. + + movq (x), a + + movq a, d + movq a, w + shlq $2, d + subq d, w + xorq $2, w + + movq w, d + imulq a, d + movl $2, ashort + addq d, a + addq $1, d + + imulq a, w + + imulq d, d + movl $1, ashort + addq d, a + imulq a, w + + imulq d, d + movl $1, ashort + addq d, a + imulq a, w + + imulq d, d + movl $1, ashort + addq d, a + imulq a, w + +// Write that as lowest word of the output, then if k = 1 we're finished + + movq w, (z) + cmpq $1, k + jz bignum_negmodinv_end + +// Otherwise compute and write the other digits (1..k-1) of w * x + 1 + + movq (x), a + xorq h, h + mulq w + addq $1, a + adcq d, h + movl $1, ishort +bignum_negmodinv_initloop: + movq (x,i,8), a + mulq w + addq h, a + adcq $0, d + movq a, (z,i,8) + movq d, h + incq i + cmpq k, i + jc bignum_negmodinv_initloop + +// For simpler indexing, z := z + 8 and k := k - 1 per outer iteration +// Then we can use the same index for x and for z and effective size k. +// +// But we also offset k by 1 so the "real" size is k + 1; after doing +// the special zeroth bit we count with t through k more digits, so +// getting k + 1 total as required. +// +// This lets us avoid some special cases inside the loop at the cost +// of needing the additional "finale" tail for the final iteration +// since we do one outer loop iteration too few. + + subq $2, k + jz bignum_negmodinv_finale + +bignum_negmodinv_outerloop: + addq $8, z + + movq (z), h + movq w, m + imulq h, m + movq m, (z) + movq (x), a + mulq m + addq h, a + adcq $0, d + movq d, h + movl $1, ishort + movq k, t + bignum_negmodinv_innerloop: + adcq (z,i,8), h + sbbq e, e + movq (x,i,8), a + mulq m + subq e, d + addq h, a + movq a, (z,i,8) + movq d, h + incq i + decq t + jnz bignum_negmodinv_innerloop + + decq k + jnz bignum_negmodinv_outerloop + +bignum_negmodinv_finale: + movq 8(z), a + imulq w, a + movq a, 8(z) + +bignum_negmodinv_end: + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_nonzero.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_nonzero.S new file mode 100644 index 00000000000..2717367f5c9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_nonzero.S @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for nonzero-ness x =/= 0 +// Input x[k]; output function return +// +// extern uint64_t bignum_nonzero (uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_nonzero) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_nonzero) + .text + +#define a %rax +#define k %rdi +#define x %rsi + +S2N_BN_SYMBOL(bignum_nonzero): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + xorq a, a + testq k, k + jz bignum_nonzero_end + +bignum_nonzero_loop: + orq -8(x,k,8), a + decq k + jnz bignum_nonzero_loop + +// Set a standard C condition based on whether a is nonzero + + negq a + sbbq a, a + negq a + +bignum_nonzero_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_normalize.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_normalize.S new file mode 100644 index 00000000000..1056ed8305e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_normalize.S @@ -0,0 +1,124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Normalize bignum in-place by shifting left till top bit is 1 +// Input z[k]; outputs function return (bits shifted left) and z[k] +// +// extern uint64_t bignum_normalize (uint64_t k, uint64_t *z); +// +// Given a k-digit bignum z, this function shifts it left by its number of +// leading zero bits, to give result with top bit 1, unless the input number +// was 0. The return is the same as the output of bignum_clz, i.e. the number +// of bits shifted (nominally 64 * k in the case of zero input). +// +// Standard x86-64 ABI: RDI = k, RSI = z, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_normalize) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_normalize) + .text + +#define k %rdi +#define z %rsi + +// Return value, which we put in %rax to save a move or two + +#define r %rax + +// Other variables +// Matters that c is RCX as CL=lo(c) is assumed in shifts + +#define b %r9 +#define c %rcx +#define d %rdx +#define i %r8 +#define j %r10 + +#define dshort %edx + + +S2N_BN_SYMBOL(bignum_normalize): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Initialize shift count r = 0 and i = k - 1 but return immediately if k = 0. +// Otherwise load top digit c, but then if k = 1 skip the digitwise part + + xorq r, r + movq k, i + subq $1, i + jc bignum_normalize_end + movq (z,i,8), c + jz bignum_normalize_bitpart + +// Do d rather stupid but constant-time digit normalization, conditionally +// shifting left (k-1) times based on whether the top word is zero. 
+// With careful binary striding this could be O(k*log(k)) instead of O(k^2) +// while still retaining the constant-time style. + +bignum_normalize_normloop: + xorq j, j + movq k, b + movq r, d + incq r + negq c + cmovneq d, r + movl $0, dshort +bignum_normalize_shufloop: + movq d, c + movq (z,j,8), d + cmovcq d, c + movq c, (z,j,8) + incq j + decq b + jnz bignum_normalize_shufloop + decq i + jnz bignum_normalize_normloop + +// We now have the top digit nonzero, assuming the input was nonzero, +// and as per the invariant of the loop above, c holds that digit. So +// now just count c's leading zeros and shift z bitwise that many bits. +// We need to patch the bsr result for the undefined case of zero input + +bignum_normalize_bitpart: + movl $127, dshort + bsrq c, c + cmovzq d, c + xorq $63, c + + shlq $6, r + addq c, r + + xorq b, b + xorq i, i +bignum_normalize_bitloop: + movq (z,i,8), d + movq d, j + shldq %cl, b, d + movq d, (z,i,8) + movq j, b + incq i + cmpq k, i + jc bignum_normalize_bitloop + + bignum_normalize_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_odd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_odd.S new file mode 100644 index 00000000000..81caa3a8bde --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_odd.S @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Test bignum for odd-ness +// Input x[k]; output function return +// +// extern uint64_t bignum_odd (uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = k, RSI = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_odd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_odd) + .text + +S2N_BN_SYMBOL(bignum_odd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set default return value of 0 and finish if k = 0 (trivially not odd) + + xorl %eax, %eax + testq %rdi, %rdi + jz bignum_odd_end + +// Otherwise return lowest bit of the input + + movl $1, %eax + andq (%rsi), %rax + +bignum_odd_end: + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_of_word.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_of_word.S new file mode 100644 index 00000000000..cb7c794979b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_of_word.S @@ -0,0 +1,69 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert single digit to bignum, z := n +// Input n; output z[k] +// +// extern void bignum_of_word (uint64_t k, uint64_t *z, uint64_t n); +// +// Create a k-digit (digit=64 bits) bignum at z with value n (mod 2^k) +// where n is a word. The "mod 2^k" only matters in the degenerate k = 0 case. 
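Reviewer aid, not part of the patch: the bignum_of_word behaviour described above amounts to the following C sketch (the _ref name is illustrative only); the one subtlety is the degenerate k = 0 case, where nothing is written.

#include <stdint.h>

// Hedged reference for bignum_of_word: z becomes the k-digit value n mod 2^(64*k).
static void bignum_of_word_ref(uint64_t k, uint64_t *z, uint64_t n)
{
  if (k == 0)
    return;                    // "mod 2^0" case: no digits to write
  z[0] = n;                    // the low digit carries the whole value
  for (uint64_t i = 1; i < k; i++)
    z[i] = 0;                  // all higher digits are zero
}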
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_of_word) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_of_word) + .text + +S2N_BN_SYMBOL(bignum_of_word): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// If k = 0 do nothing + + testq %rdi, %rdi + jz bignum_of_word_end + +bignum_of_word_nontrivial: + +// Write lowest word and jump to end if k = 1 + + movq %rdx, (%rsi) + decq %rdi + jz bignum_of_word_end + +// Zero %rdx and write it to all z[i] for i = k-1 down to 1 +// It's a bit more compact to iterate "high to low" like this. +// But at the cost of bumping up %rsi by lea %rsi, [%rsi+8] +// each time round the loop (which also modifies one more reg) +// we could go "low to high" if it helps with prefetch etc. + + xorq %rdx, %rdx +bignum_of_word_loop: + movq %rdx, (%rsi,%rdi,8) + decq %rdi + jnz bignum_of_word_loop + +bignum_of_word_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optadd.S new file mode 100644 index 00000000000..90aa07e0a44 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optadd.S @@ -0,0 +1,92 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally add, z := x + y (if p nonzero) or z := x (if p zero) +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_optadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); +// +// It is assumed that all numbers x, y and z have the same size k digits. +// Returns carry-out as per usual addition, always 0 if p was zero. 
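Reviewer aid, not part of the patch: the optional-add pattern of bignum_optadd (implementation follows below) can be restated in C with the neg/sbb mask trick spelled out; the _ref name and the carry bookkeeping are illustrative, not the code being imported.

#include <stdint.h>

// p is converted to an all-ones/all-zeros mask so the same carry loop runs
// whether or not y is actually added; the return is the top carry, 0 if p == 0.
static uint64_t bignum_optadd_ref(uint64_t k, uint64_t *z, const uint64_t *x,
                                  uint64_t p, const uint64_t *y)
{
  uint64_t mask = (p != 0) ? ~(uint64_t)0 : 0;   // C version of "neg p; sbb p, p"
  uint64_t carry = 0;
  for (uint64_t i = 0; i < k; i++) {
    uint64_t b = y[i] & mask;        // 0 when p == 0, y[i] otherwise
    uint64_t s = x[i] + b;
    uint64_t c1 = (s < x[i]);        // carry out of x[i] + b
    uint64_t t = s + carry;
    uint64_t c2 = (t < s);           // carry out of adding the running carry
    z[i] = t;
    carry = c1 | c2;
  }
  return carry;
}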
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = p, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = p, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optadd) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define p %rcx +#define y %r8 + +#define c %rax +#define i %r9 +#define b %r10 +#define a %r11 + + +S2N_BN_SYMBOL(bignum_optadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Initialize top carry to zero in all cases (also return value) + + xorq c, c + +// If k = 0 do nothing + + testq k, k + jz bignum_optadd_end + +// Convert the nonzero/zero status of p into an all-1s or all-0s mask + + negq p + sbbq p, p + +// Now go round the loop for i=0...k-1, saving the carry in c each iteration + + xorq i, i +bignum_optadd_loop: + movq (x,i,8), a + movq (y,i,8), b + andq p, b + negq c + adcq b, a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_optadd_loop + +// Return top carry + + negq %rax + +bignum_optadd_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optneg.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optneg.S new file mode 100644 index 00000000000..288c887f028 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optneg.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate, z := -x (if p nonzero) or z := x (if p zero) +// Inputs p, x[k]; outputs function return (nonzero input) and z[k] +// +// extern uint64_t bignum_optneg +// (uint64_t k, uint64_t *z, uint64_t p, uint64_t *x); +// +// It is assumed that both numbers x and z have the same size k digits. +// Returns a carry, which is equivalent to "x is nonzero". 
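Reviewer aid, not part of the patch: the optional negation implemented below is branchless two's-complement negation under a mask; a C sketch follows (names illustrative). The final fix-up makes the return "x is nonzero" in the negating case and 0 when p is zero.

#include <stdint.h>

static uint64_t bignum_optneg_ref(uint64_t k, uint64_t *z, uint64_t p,
                                  const uint64_t *x)
{
  uint64_t mask = (p != 0) ? ~(uint64_t)0 : 0;   // complement digits only if p != 0
  uint64_t carry = (p != 0);                     // the +1 of two's complement
  for (uint64_t i = 0; i < k; i++) {
    uint64_t a = (x[i] ^ mask) + carry;
    carry = (a < carry);                         // carry out of this digit
    z[i] = a;
  }
  // When negating, the carry out is 1 exactly for x == 0, so it is inverted;
  // when p == 0 the mask is 0 and the whole expression stays 0.
  return (carry ^ mask) & 1;
}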
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = p, RCX = x, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = p, R9 = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg) + .text + +#define k %rdi +#define z %rsi +#define p %rdx +#define x %rcx + +#define c %rax +#define a %r8 +#define i %r9 + +#define cshort %eax + +S2N_BN_SYMBOL(bignum_optneg): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// If k = 0 do nothing, but need to set zero return for the carry (c = %rax) + + xorq c, c + testq k, k + jz bignum_optneg_end + +// Convert p into a strict bitmask and set initial carry-in in c + + negq p + sbbq p, p + subq p, c + +// Main loop + + xorq i, i +bignum_optneg_loop: + + movq (x,i,8), a + xorq p, a + addq c, a + movl $0, cshort + movq a, (z,i,8) + adcq $0, c + incq i + cmpq k, i + jc bignum_optneg_loop + +// Return carry flag, fixing up inversion for negative case + + xorq p, %rax + andq $1, %rax + +bignum_optneg_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsub.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsub.S new file mode 100644 index 00000000000..29a716b7bb0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsub.S @@ -0,0 +1,92 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_optsub +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); +// +// It is assumed that all numbers x, y and z have the same size k digits. +// Returns carry-out as per usual subtraction, always 0 if p was zero. 
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = p, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = p, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optsub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optsub) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define p %rcx +#define y %r8 + +#define i %r9 +#define b %r10 +#define c %rax +#define a %r11 + + +S2N_BN_SYMBOL(bignum_optsub): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Initialize top carry to zero in all cases (also return value) + + xorq c, c + +// If k = 0 do nothing + + testq k, k + jz bignum_optsub_end + +// Convert the nonzero/zero status of p into an all-1s or all-0s mask + + negq p + sbbq p, p + +// Now go round the loop for i=0...k-1, saving the carry in c each iteration + + xorq i, i +bignum_optsub_loop: + movq (x,i,8), a + movq (y,i,8), b + andq p, b + negq c + sbbq b, a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_optsub_loop + +// Return top carry + + negq %rax + +bignum_optsub_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsubadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsubadd.S new file mode 100644 index 00000000000..051886d3070 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_optsubadd.S @@ -0,0 +1,109 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally subtract or add, z := x + sgn(p) * y interpreting p as signed +// Inputs x[k], p, y[k]; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_optsubadd +// (uint64_t k, uint64_t *z, uint64_t *x, uint64_t p, uint64_t *y); +// +// If p has top bit set (i.e. is negative as a signed int) return z := x - y +// Else if p is nonzero (i.e. is positive as a signed int) return z := x + y +// Otherwise (i.e. 
p is zero) return z := x +// +// Return in RDI = the top carry, which will be 0 or 1, and appropriate for +// addition or subtraction respectively (and always zero for p = 0) +// +// 2^{64*k} * -carryout + z = x - y [for subtraction] +// 2^{64*k} * carryout + z = x + y [for addition] +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = x, RCX = p, R8 = y, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = x, R9 = p, [RSP+40] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optsubadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optsubadd) + .text + +#define k %rdi +#define z %rsi +#define x %rdx +#define p %rcx +#define y %r8 + +#define c %rax +#define i %r9 +#define m %rcx +#define q %r10 +#define a %r11 + + +S2N_BN_SYMBOL(bignum_optsubadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Initialize top carry to zero in all cases (also return value) + + xorq c, c + +// If k = 0 do nothing + + testq k, k + jz bignum_optsubadd_end + +// Turn the input p into two bitmasks, m indicating to use the y input at +// all (same register as p) and q indicating a sign-flip + + movq p, q + sarq $63, q + negq p + sbbq m, m + +// Generate an initial carry-in for the negating case only to add 1; this +// is because we are actually going to do complements of the words of y + + movq q, c + +// Now go round the loop for i=0...k-1, saving the carry in c each iteration + + xorq i, i +bignum_optsubadd_loop: + movq (y,i,8), a + xorq q, a + andq m, a + negq c + adcq (x,i,8), a + sbbq c, c + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_optsubadd_loop + +// Return carry flag, fixing up inversion for negative case + + xorq q, %rax + negq %rax + +bignum_optsubadd_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_pow2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_pow2.S new file mode 100644 index 00000000000..0e0b0206b95 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_pow2.S @@ -0,0 +1,81 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return bignum of power of 2, z := 2^n +// Input n; output z[k] +// +// extern void bignum_pow2 (uint64_t k, uint64_t *z, uint64_t n); +// +// The result is as usual mod 2^{64*k}, so will be zero if n >= 64*k. 
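Reviewer aid, not part of the patch: a C sketch of the constant-time selection used by bignum_pow2 (the code follows below). Unlike the hardware shift, C needs the explicit "& 63" mask, since shifting a 64-bit value by 64 or more is undefined; the ternary stands in for the cmovz.

#include <stdint.h>

static void bignum_pow2_ref(uint64_t k, uint64_t *z, uint64_t n)
{
  uint64_t w   = (uint64_t)1 << (n & 63);   // 2^(n mod 64)
  uint64_t idx = n >> 6;                    // digit that receives the bit
  for (uint64_t i = 0; i < k; i++)
    z[i] = (i == idx) ? w : 0;              // every digit written; all zero if n >= 64*k
}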
+// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_pow2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_pow2) + .text + +#define k %rdi +#define z %rsi +#define n %rdx + +#define i %rcx +#define w %rax +#define a %r8 + +#define wshort %eax + + + +S2N_BN_SYMBOL(bignum_pow2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// If k = 0 do nothing + + testq k, k + jz bignum_pow2_end + +// Create the index n at which to write the nonzero word and the word w itself +// Note that the x86 manual explicitly says that shift counts are taken modulo +// the datasize, so we don't need to mask the lower 6 bits of n ourselves. + + movl $1, wshort + movq n, %rcx + shlq %cl, w + shrq $6, n + +// Now in a constant-time fashion set the n'th word to w and others to zero + + xorq i, i +bignum_pow2_loop: + xorq a, a + cmpq n, i + cmovzq w, a + movq a, (z,i,8) + incq i + cmpq k, i + jc bignum_pow2_loop + +bignum_pow2_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shl_small.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shl_small.S new file mode 100644 index 00000000000..f2170a56d03 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shl_small.S @@ -0,0 +1,124 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Shift bignum left by c < 64 bits z := x * 2^c +// Inputs x[n], c; outputs function return (carry-out) and z[k] +// +// extern uint64_t bignum_shl_small +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); +// +// Does the "z := x << c" operation where x is n digits, result z is p. +// The shift count c is masked to 6 bits so it actually uses c' = c mod 64. +// The return value is the "next word" of a p+1 bit result, if n <= p. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = c, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = c, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_shl_small) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_shl_small) + .text + +#define p %rdi +#define z %rsi +#define n %rdx + +// These get moved from their initial positions + +#define c %rcx +#define x %r9 + +// Other variables + +#define b %rax +#define t %r8 +#define a %r10 +#define i %r11 + + + +S2N_BN_SYMBOL(bignum_shl_small): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// First clamp the input size n := min(p,n) since we can never need to read +// past the p'th term of the input to generate p-digit output. + + cmpq n, p + cmovcq p, n + +// Initialize "previous word" carry b to zero and main index i also to zero. 
+// Then just skip the main loop if n = 0 + + xorq b, b + xorq i, i + + testq n, n + jz bignum_shl_small_tail + +// Reshuffle registers to put the shift count into CL + + movq %rcx, x + movq %r8, c + +// Now the main loop + +bignum_shl_small_loop: + movq (x,i,8), a + movq a, t + shldq %cl, b, a + movq a, (z,i,8) + movq t, b + incq i + cmpq n, i + jc bignum_shl_small_loop + +// Shift the top word correspondingly. Using shld one more time is easier +// than carefully producing a complementary shift with care over the zero case + + xorq a, a + shldq %cl, b, a + movq a, b + +// If we are at the end, finish, otherwise write carry word then zeros + +bignum_shl_small_tail: + cmpq p, i + jnc bignum_shl_small_end + movq b, (z,i,8) + xorq b, b + incq i + cmpq p, i + jnc bignum_shl_small_end + +bignum_shl_small_tloop: + movq b, (z,i,8) + incq i + cmpq p, i + jc bignum_shl_small_tloop + +// Return, with RAX = b as the top word + +bignum_shl_small_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shr_small.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shr_small.S new file mode 100644 index 00000000000..8224c650c19 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_shr_small.S @@ -0,0 +1,114 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Shift bignum right by c < 64 bits z := floor(x / 2^c) +// Inputs x[n], c; outputs function return (bits shifted out) and z[k] +// +// extern uint64_t bignum_shr_small +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x, uint64_t c); +// +// Does the "z := x >> c" operation where x is n digits, result z is p. +// The shift count c is masked to 6 bits so it actually uses c' = c mod 64. +// The return value is the inout mod 2^c'. +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x, R8 = c, returns RAX +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x, [RSP+40] = c, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_shr_small) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_shr_small) + .text + +#define p %rdi +#define z %rsi +#define n %rdx + +// These get moved from their initial positions + +#define c %rcx +#define x %r9 + +// Other variables + +#define b %rax +#define t %r8 +#define a %r10 + +#define ashort %r10d + + + +S2N_BN_SYMBOL(bignum_shr_small): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 +#endif + +// Reshuffle registers to put the shift count into CL + + movq %rcx, x + movq %r8, c + +// Set default carry-in word to 0, useful for other things too + + xorq b, b + +// First, if p > n then pad output on the left with p-n zeros + + cmpq p, n + jnc bignum_shr_small_nopad +bignum_shr_small_padloop: + decq p + movq b, (z,p,8) + cmpq p, n + jc bignum_shr_small_padloop +bignum_shr_small_nopad: + +// We now know that p <= n. 
If in fact p < n let carry word = x[p] instead of 0 + + jz bignum_shr_small_shiftstart + movq (x,p,8), b +bignum_shr_small_shiftstart: + testq p, p + jz bignum_shr_small_trivial + +// Now the main loop + +bignum_shr_small_loop: + movq -8(x,p,8), a + movq a, t + shrdq %cl, b, a + movq a, -8(z,p,8) + movq t, b + decq p + jnz bignum_shr_small_loop + +// Mask the carry word and return with that as RAX = b + +bignum_shr_small_trivial: + movl $1, ashort + shlq %cl, a + decq a + andq a, b + +bignum_shr_small_end: +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sqr.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sqr.S new file mode 100644 index 00000000000..916f22a8a4b --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sqr.S @@ -0,0 +1,186 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square z := x^2 +// Input x[n]; output z[k] +// +// extern void bignum_sqr +// (uint64_t k, uint64_t *z, uint64_t n, uint64_t *x); +// +// Does the "z := x^2" operation where x is n digits and result z is k. +// Truncates the result in general unless k >= 2 * n +// +// Standard x86-64 ABI: RDI = k, RSI = z, RDX = n, RCX = x +// Microsoft x64 ABI: RCX = k, RDX = z, R8 = n, R9 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr) + .text + +// First three are where arguments come in, but n is moved. + +#define p %rdi +#define z %rsi +#define x %rcx +#define n %r8 + +// These are always local scratch since multiplier result is in these + +#define a %rax +#define d %rdx + +// Other variables + +#define i %rbx +#define ll %rbp +#define hh %r9 +#define k %r10 +#define y %r11 +#define htop %r12 +#define l %r13 +#define h %r14 +#define c %r15 + +// Short versions + +#define llshort %ebp + +S2N_BN_SYMBOL(bignum_sqr): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// We use too many registers, and also we need %rax:%rdx for multiplications + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, n + +// If p = 0 the result is trivial and nothing needs doing + + testq p, p + jz bignum_sqr_end + +// initialize (hh,ll) = 0 + + xorl llshort, llshort + xorq hh, hh + +// Iterate outer loop from k = 0 ... k = p - 1 producing result digits + + xorq k, k + +bignum_sqr_outerloop: + +// First let bot = MAX 0 (k + 1 - n) and top = MIN (k + 1) n +// We want to accumulate all x[i] * x[k - i] for bot <= i < top +// For the optimization of squaring we avoid duplication and do +// 2 * x[i] * x[k - i] for i < htop, where htop = MIN ((k+1)/2) n +// Initialize i = bot; in fact just compute bot as i directly. 
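Reviewer aid, not part of the patch: the column bounds described in the comment above (bot, top, htop, the doubling of cross products and the extra square term for even k) are easier to check against a plain C reference. The helper name, the unsigned __int128 type (a GCC/Clang extension) and the 192-bit accumulator layout below are illustrative, not the code being imported.

#include <stdint.h>
typedef unsigned __int128 u128;   // GCC/Clang extension, assumed available

static void bignum_sqr_ref(uint64_t p, uint64_t *z, uint64_t n, const uint64_t *x)
{
  uint64_t ll = 0, hh = 0;                          // carry words from the previous column
  for (uint64_t k = 0; k < p; k++) {
    uint64_t bot  = (k + 1 > n) ? (k + 1 - n) : 0;  // lowest i with k - i < n
    uint64_t htop = (k + 1) / 2;                    // cross products stop at i < htop
    if (htop > n) htop = n;
    u128 lo = 0;  uint64_t hi = 0;                  // 192-bit column sum (hi:lo)
    for (uint64_t i = bot; i < htop; i++) {         // each cross product counted once...
      u128 prod = (u128)x[i] * x[k - i];
      u128 t = lo + prod;  hi += (t < lo);  lo = t;
    }
    hi = (hi << 1) | (uint64_t)(lo >> 127);         // ...then the whole sum doubled
    lo <<= 1;
    if ((k & 1) == 0 && k / 2 < n) {                // extra square term for even k
      u128 sq = (u128)x[k / 2] * x[k / 2];
      u128 t = lo + sq;  hi += (t < lo);  lo = t;
    }
    u128 prev = ((u128)hh << 64) | ll;              // add the carry-in from the last column
    u128 t = lo + prev;  hi += (t < lo);  lo = t;
    z[k] = (uint64_t)lo;                            // low word is this output digit
    ll = (uint64_t)(lo >> 64);                      // keep the top 128 bits for the next column
    hh = hi;
  }
}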
+ + xorq c, c + leaq 1(k), i + movq i, htop + shrq $1, htop + subq n, i + cmovcq c, i + cmpq n, htop + cmovncq n, htop + +// Initialize the three-part local sum (c,h,l); c was already done above + + xorq l, l + xorq h, h + +// If htop <= bot then main doubled part of the sum is empty + + cmpq htop, i + jnc bignum_sqr_nosumming + +// Use a moving pointer for [y] = x[k-i] for the cofactor + + movq k, a + subq i, a + leaq (x,a,8), y + +// Do the main part of the sum x[i] * x[k - i] for 2 * i < k + +bignum_sqr_innerloop: + movq (x,i,8), a + mulq (y) + addq a, l + adcq d, h + adcq $0, c + subq $8, y + incq i + cmpq htop, i + jc bignum_sqr_innerloop + +// Now double it + + addq l, l + adcq h, h + adcq c, c + +// If k is even (which means 2 * i = k) and i < n add the extra x[i]^2 term + +bignum_sqr_nosumming: + testq $1, k + jnz bignum_sqr_innerend + cmpq n, i + jnc bignum_sqr_innerend + + movq (x,i,8), a + mulq a + addq a, l + adcq d, h + adcq $0, c + +// Now add the local sum into the global sum, store and shift + +bignum_sqr_innerend: + addq ll, l + movq l, (z,k,8) + adcq hh, h + movq h, ll + adcq $0, c + movq c, hh + + incq k + cmpq p, k + jc bignum_sqr_outerloop + +// Restore registers and return + +bignum_sqr_end: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sub.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sub.S new file mode 100644 index 00000000000..589b89500e2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/bignum_sub.S @@ -0,0 +1,142 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract, z := x - y +// Inputs x[m], y[n]; outputs function return (carry-out) and z[p] +// +// extern uint64_t bignum_sub +// (uint64_t p, uint64_t *z, +// uint64_t m, uint64_t *x, uint64_t n, uint64_t *y); +// +// Does the z := x - y operation, truncating modulo p words in general and +// returning a top borrow (0 or 1) in the p'th place, only subtracting input +// words below p (as well as m and n respectively) to get the diff and borrow. +// +// Standard x86-64 ABI: RDI = p, RSI = z, RDX = m, RCX = x, R8 = n, R9 = y, returns RAX +// Microsoft x64 ABI: RCX = p, RDX = z, R8 = m, R9 = x, [RSP+40] = n, [RSP+48] = y, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub) + .text + +#define p %rdi +#define z %rsi +#define m %rdx +#define x %rcx +#define n %r8 +#define y %r9 +#define i %r10 +#define a %rax + +#define ashort %eax + + + +S2N_BN_SYMBOL(bignum_sub): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + movq 56(%rsp), %r8 + movq 64(%rsp), %r9 +#endif + +// Zero the main index counter for both branches + + xorq i, i + +// First clamp the two input sizes m := min(p,m) and n := min(p,n) since +// we'll never need words past the p'th. Can now assume m <= p and n <= p. 
+// Then compare the modified m and n and branch accordingly + + cmpq m, p + cmovcq p, m + cmpq n, p + cmovcq p, n + cmpq n, m + jc bignum_sub_ylonger + +// The case where x is longer or of the same size (p >= m >= n) + + subq m, p + subq n, m + incq m + testq n, n + jz bignum_sub_xtest +bignum_sub_xmainloop: + movq (x,i,8), a + sbbq (y,i,8), a + movq a, (z,i,8) + incq i + decq n + jnz bignum_sub_xmainloop + jmp bignum_sub_xtest +bignum_sub_xtoploop: + movq (x,i,8), a + sbbq $0, a + movq a, (z,i,8) + incq i +bignum_sub_xtest: + decq m + jnz bignum_sub_xtoploop + sbbq a, a + testq p, p + jz bignum_sub_tailskip +bignum_sub_tailloop: + movq a, (z,i,8) + incq i + decq p + jnz bignum_sub_tailloop +bignum_sub_tailskip: + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +// The case where y is longer (p >= n > m) + +bignum_sub_ylonger: + + subq n, p + subq m, n + testq m, m + jz bignum_sub_ytoploop +bignum_sub_ymainloop: + movq (x,i,8), a + sbbq (y,i,8), a + movq a, (z,i,8) + incq i + decq m + jnz bignum_sub_ymainloop +bignum_sub_ytoploop: + movl $0, ashort + sbbq (y,i,8), a + movq a, (z,i,8) + incq i + decq n + jnz bignum_sub_ytoploop + sbbq a, a + testq p, p + jnz bignum_sub_tailloop + negq a +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_bytereverse.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_bytereverse.S new file mode 100644 index 00000000000..8f22655043a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_bytereverse.S @@ -0,0 +1,41 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reverse the order of bytes in a 64-bit word +// +// extern uint64_t word_bytereverse (uint64_t a); +// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_bytereverse) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_bytereverse) + .text + +// Just uses the x86 BSWAP instruction, which does the job directly + +S2N_BN_SYMBOL(word_bytereverse): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + + movq %rdi, %rax + bswapq %rax +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_clz.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_clz.S new file mode 100644 index 00000000000..8b613fc4194 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_clz.S @@ -0,0 +1,49 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count leading zero bits in a single word +// Input a; output function return +// +// extern uint64_t word_clz (uint64_t a); +// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_clz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_clz) + .text + +S2N_BN_SYMBOL(word_clz): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// First do %rax = 63 - bsr(a), which is right except (maybe) for zero inputs + + bsrq %rdi, %rax + xorq $63, %rax + +// Force return of 64 in the zero-input case + + movl $64, %edx + testq %rdi, %rdi + cmoveq %rdx, %rax + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_ctz.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_ctz.S new file mode 100644 index 00000000000..be1db1491fb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_ctz.S @@ -0,0 +1,48 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count trailing zero bits in a single word +// Input a; output function return +// +// extern uint64_t word_ctz (uint64_t a); +// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_ctz) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_ctz) + .text + +S2N_BN_SYMBOL(word_ctz): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// First just do %rax = bsf(a), which is right except (maybe) for zero inputs + + bsfq %rdi, %rax + +// Force return of 64 in the zero-input case + + movl $64, %edx + testq %rdi, %rdi + cmoveq %rdx, %rax + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_divstep59.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_divstep59.S new file mode 100644 index 00000000000..139c83b7a5f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_divstep59.S @@ -0,0 +1,402 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
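Reviewer aid, not part of the patch: the BSR/BSF-plus-CMOV pattern in word_clz and word_ctz above has the same effect as the following C, assuming the GCC/Clang builtins, which (like BSR/BSF) leave the zero-input case undefined and therefore need the same explicit guard.

#include <stdint.h>

static uint64_t word_clz_ref(uint64_t a) { return a ? (uint64_t)__builtin_clzll(a) : 64; }
static uint64_t word_ctz_ref(uint64_t a) { return a ? (uint64_t)__builtin_ctzll(a) : 64; }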
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Perform 59 "divstep" iterations and return signed matrix of updates +// Inputs d, f, g; output m[2][2] and function return +// +// extern int64_t word_divstep59 +// (int64_t m[2][2],int64_t d,uint64_t f,uint64_t g); +// +// Standard x86-64 ABI: RDI = m, RSI = d, RDX = f, RCX = g, returns RAX +// Microsoft x64 ABI: RCX = m, RDX = d, R8 = f, R9 = g, returns RAX +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_divstep59) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_divstep59) + .text + +#define mat %rdi + +#define d %rsi +#define fuv %rbx +#define grs %rcx + +#define f %r12 +#define g %r13 + +#define m %r8 +#define t %r9 + +#define zero %rbp +#define zeroe %ebp +#define minus2 %rax +#define minus2e %eax +#define plus2 %rdx +#define plus2e %edx + +#define m00 %r8 +#define m01 %r9 +#define m10 %r10 +#define m11 %r11 + +S2N_BN_SYMBOL(word_divstep59): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + +// Save extra registers + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + +// Pack f and g into single registers with (negated) update matrices, +// initially the identity matrix. The f_lo and g_lo are initially +// the 20 lowest bits of f and g. +// +// fuv = f_lo - 2^41 * 1 - 2^62 * 0 +// grs = g_lo - 2^41 * 0 - 2^62 * 1 + + movq %rdx, fuv + movq %rdx, f + andq $0xFFFFF, fuv + movq $0xFFFFFE0000000000, %rax + orq %rax, fuv + + movq %rcx, g + andq $0xFFFFF, grs + movq $0xc000000000000000, %rax + orq %rax, grs + +// Now do 20 divsteps on that packed format. +// +// At the i'th iteration (starting at i = 0, ending at i = 20) +// the intermediate packed values are of the form +// +// fuv = f_lo - 2^{41-i} * m00 - 2^{62-i} * m01 +// grs = g_lo - 2^{41-i} * m10 - 2^{62-i} * m11 +// +// where the following matrix indicates the updates to apply +// to the original (full-sized) f and g for those iterations. +// +// [m00 m01] * [f_0] = [f_i] +// [m10 m11] [g_0] [g_i] + + movq $-2, minus2 + xorl zeroe, zeroe + movl $2, plus2e + movq fuv, t + movq minus2, m + testq d, d + cmovs zero, m + testq $1, grs +.set i, 0 +.rep 20 + cmovzq zero, m + cmovzq zero, t +.if (i != 0) + sarq $1, grs +.endif + xorq m, t + xorq m, d + btq $63, m + cmovcq grs, fuv + movq minus2, m + addq plus2, d + leaq (grs,t), grs +.if (i != 19) + cmovs zero, m + movq fuv, t + testq plus2, grs +.endif +.set i, (i+1) +.endr + sarq $1, grs + +// Extract the matrix entries, but keep them in negated form. +// Store them in the output buffer temporarily. + + movl $1048576, %eax + leaq (fuv,%rax), m00 + leaq (grs,%rax), m10 + shlq $22, m00 + shlq $22, m10 + sarq $43, m00 + sarq $43, m10 + + movq $2199024304128, %rax + leaq (fuv,%rax), m01 + leaq (grs,%rax), m11 + sarq $42, m01 + sarq $42, m11 + + movq m00, (mat) + movq m01, 8(mat) + movq m10, 16(mat) + movq m11, 24(mat) + +// Compute updated f and g using the negated matrix entries; +// this flips the signs of f and g but it doesn't matter. +// +// f = (m00 * f + m01 * g) / 2^20 +// g = (m10 * f + m11 * g) / 2^20 +// +// Since we only need another 40 bits, we can do all of that +// computation naively using (implicitly signed) 64-bit words. 
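Reviewer aid, not part of the patch: the f/g update just described can be restated as below; the products are done in uint64_t so the wraparound is well defined, while the conversion back and the >> 20 assume the usual two's-complement, arithmetic-shift behaviour for int64_t. Only the low-order bits matter here, as the comment above notes. The helper name is illustrative.

#include <stdint.h>

// m00..m11 are the (negated) matrix entries extracted from the 20 divsteps.
static void divstep_update_fg(int64_t m00, int64_t m01, int64_t m10, int64_t m11,
                              int64_t *f, int64_t *g)
{
  uint64_t f0 = (uint64_t)*f, g0 = (uint64_t)*g;   // read both before writing either
  *f = (int64_t)((uint64_t)m00 * f0 + (uint64_t)m01 * g0) >> 20;
  *g = (int64_t)((uint64_t)m10 * f0 + (uint64_t)m11 * g0) >> 20;
}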
+ + imulq f, m10 + imulq m00, f + imulq g, m01 + imulq m11, g + addq m01, f + addq m10, g + sarq $20, f + sarq $20, g + +// Re-pack for 20 more rounds + + movq f, fuv + andq $0xFFFFF, fuv + movq $0xFFFFFE0000000000, %rax + orq %rax, fuv + + movq g, grs + andq $0xFFFFF, grs + movq $0xc000000000000000, %rax + orq %rax, grs + +// Second block of 20 divsteps in the same style + + movq $-2, minus2 + movl $2, plus2e + movq fuv, t + movq minus2, m + testq d, d + cmovs zero, m + testq $1, grs +.set i, 0 +.rep 20 + cmovzq zero, m + cmovzq zero, t +.if (i != 0) + sarq $1, grs +.endif + xorq m, t + xorq m, d + btq $63, m + cmovcq grs, fuv + movq minus2, m + addq plus2, d + leaq (grs,t), grs +.if (i != 19) + cmovs zero, m + movq fuv, t + testq plus2, grs +.endif +.set i, (i+1) +.endr + sarq $1, grs + +// Extract the next matrix entries, in negated form again + + movl $1048576, %eax + leaq (fuv,%rax), m00 + leaq (grs,%rax), m10 + shlq $22, m00 + shlq $22, m10 + sarq $43, m00 + sarq $43, m10 + + movq $2199024304128, %rax + leaq (fuv,%rax), m01 + leaq (grs,%rax), m11 + sarq $42, m01 + sarq $42, m11 + +// Compute updated f and g using the negated matrix entries, +// and so again flipping (thus actually restoring) the signs. +// +// f = (n00 * f + n01 * g) / 2^20 +// g = (n10 * f + n11 * g) / 2^20 + + movq g, fuv + movq f, grs + imulq m00, f + imulq m01, fuv + addq fuv, f + imulq m11, g + imulq m10, grs + addq grs, g + sarq $20, f + sarq $20, g + +// Re-pack for 20 more rounds + + movq f, fuv + andq $0xFFFFF, fuv + movq $0xFFFFFE0000000000, %rax + orq %rax, fuv + + movq g, grs + andq $0xFFFFF, grs + movq $0xc000000000000000, %rax + orq %rax, grs + +// Multiply the first two matrices, and re-store in the output buffer. +// +// [m00_new m01_new] = [m00 m01] * [m00_prev m01_prev] +// [m10_new m11_new] [m10 m11] [m10_prev m11_prev] +// +// The resulting matrix entries are: +// +// m00_new = m00 * m00_prev + m01 * m10_prev +// m01_new = m00 * m01_prev + m01 * m11_prev +// m10_new = m10 * m00_prev + m11 * m10_prev +// m11_new = m10 * m01_prev + m11 * m11_prev +// +// At this point the sign is right since both matrices were negated. 
+ + movq (mat), %rax + imulq m00, %rax + movq 16(mat), %rdx + imulq m01, %rdx + imulq 8(mat), m00 + imulq 24(mat), m01 + addq m00, m01 + leaq (%rax,%rdx), m00 + + movq (mat), %rax + imulq m10, %rax + movq 16(mat), %rdx + imulq m11, %rdx + imulq 8(mat), m10 + imulq 24(mat), m11 + addq m10, m11 + leaq (%rax,%rdx), m10 + + movq m00, (mat) + movq m01, 8(mat) + movq m10, 16(mat) + movq m11, 24(mat) + +// Third block of divsteps, same style but a total of 19 not 20 + + movq $-2, minus2 + movl $2, plus2e + movq fuv, t + movq minus2, m + testq d, d + cmovs zero, m + testq $1, grs +.set i, 0 +.rep 19 + cmovzq zero, m + cmovzq zero, t +.if (i != 0) + sarq $1, grs +.endif + xorq m, t + xorq m, d + btq $63, m + cmovcq grs, fuv + movq minus2, m + addq plus2, d + leaq (grs,t), grs +.if (i != 18) + cmovs zero, m + movq fuv, t + testq plus2, grs +.endif +.set i, (i+1) +.endr + sarq $1, grs + +// Extract the matrix entries from the final 19 divsteps + + movl $1048576, %eax + leaq (fuv,%rax), m00 + leaq (grs,%rax), m10 + shlq $21, m00 + shlq $21, m10 + sarq $43, m00 + sarq $43, m10 + + movq $2199024304128, %rax + leaq (fuv,%rax), m01 + leaq (grs,%rax), m11 + sarq $43, m01 + sarq $43, m11 + +// Multiply by this new matrix +// +// [m00_new m01_new] = [m00 m01] * [m00_prev m01_prev] +// [m10_new m11_new] [m10 m11] [m10_prev m11_prev] +// +// The resulting matrix entries are: +// +// m00_new = m00 * m00_prev + m01 * m10_prev +// m01_new = m00 * m01_prev + m01 * m11_prev +// m10_new = m10 * m00_prev + m11 * m10_prev +// m11_new = m10 * m01_prev + m11 * m11_prev +// +// Since we didn't negate the n matrix, all products are negated +// and so we insert negations + + movq (mat), %rax + imulq m00, %rax + movq 16(mat), %rdx + imulq m01, %rdx + imulq 8(mat), m00 + imulq 24(mat), m01 + addq m00, m01 + leaq (%rax,%rdx), m00 + negq m01 + negq m00 + + movq (mat), %rax + imulq m10, %rax + movq 16(mat), %rdx + imulq m11, %rdx + imulq 8(mat), m10 + imulq 24(mat), m11 + addq m10, m11 + leaq (%rax,%rdx), m10 + negq m11 + negq m10 + +// Now write back the final matrix and d for the whole 59 steps + + movq m00, (mat) + movq m01, 8(mat) + movq m10, 16(mat) + movq m11, 24(mat) + movq d, %rax + +// Restore registers and return + + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_max.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_max.S new file mode 100644 index 00000000000..020be639129 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_max.S @@ -0,0 +1,45 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return maximum of two unsigned 64-bit words +// Inputs a, b; output function return +// +// extern uint64_t word_max (uint64_t a, uint64_t b); +// +// Standard x86-64 ABI: RDI = a, RSI = b, returns RAX +// Microsoft x64 ABI: RCX = a, RDX = b, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_max) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_max) + .text + +#define a %rdi +#define b %rsi + +S2N_BN_SYMBOL(word_max): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + + movq a, %rax + cmpq b, a + cmovcq b, %rax +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_min.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_min.S new file mode 100644 index 00000000000..9944383c822 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_min.S @@ -0,0 +1,45 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Return minimum of two unsigned 64-bit words +// Inputs a, b; output function return +// +// extern uint64_t word_min (uint64_t a, uint64_t b); +// +// Standard x86-64 ABI: RDI = a, RSI = b, returns RAX +// Microsoft x64 ABI: RCX = a, RDX = b, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_min) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_min) + .text + +#define a %rdi +#define b %rsi + +S2N_BN_SYMBOL(word_min): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + + movq a, %rax + cmpq b, a + cmovncq b, %rax +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_negmodinv.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_negmodinv.S new file mode 100644 index 00000000000..f8b9598597d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_negmodinv.S @@ -0,0 +1,76 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Single-word negated modular inverse (-1/a) mod 2^64 +// Input a; output function return +// +// extern uint64_t word_negmodinv (uint64_t a); +// +// A 64-bit function that returns a negated multiplicative inverse mod 2^64 +// of its input, assuming that input is odd. Given odd input a, the result z +// will satisfy a * z + 1 == 0 (mod 2^64), i.e. a 64-bit word multiplication +// a * z will give -1. 
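Reviewer aid, not part of the patch: the Newton/Hensel iteration implemented below can be written in C as follows (name illustrative); unsigned overflow is well defined, each multiplication by 1 + e squares the error term, and since the initial approximation is good to 5 bits, e^16 is divisible by 2^80 and so vanishes mod 2^64.

#include <stdint.h>

static uint64_t word_negmodinv_ref(uint64_t a)   // a assumed odd
{
  uint64_t x = (a - (a << 2)) ^ 2;   // a*x == -1 (mod 2^5)
  uint64_t e = a * x + 1;            // error term, divisible by 2^5
  x *= 1 + e;  e *= e;               // error of x now e^2
  x *= 1 + e;  e *= e;               // error of x now e^4
  x *= 1 + e;  e *= e;               // error of x now e^8
  x *= 1 + e;                        // error e^16 == 0 (mod 2^64)
  return x;
}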
+// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_negmodinv) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_negmodinv) + .text + +S2N_BN_SYMBOL(word_negmodinv): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// Initial magical 5-bit approximation x = (a - a<<2) xor 2 + + movq %rdi, %rcx + movq %rdi, %rax + shlq $2, %rcx + subq %rcx, %rax + xorq $2, %rax + +// Now refine to 64-bit congruence + + movq %rax, %rcx // %rcx = x + imulq %rdi, %rcx // %rcx = a * x + movl $2, %edx + addq %rcx, %rdx // %rdx = 1 + e = 2 + a * x + addq $1, %rcx // %rcx = e = a * x + 1 + + imulq %rdx, %rax // %rax = x * (1 + e) + + imulq %rcx, %rcx // %rcx = e^2 + movl $1, %edx + addq %rcx, %rdx + imulq %rdx, %rax // %rax = x * (1 + e) * (1 + e^2) + + imulq %rcx, %rcx // %rcx = e^4 + movl $1, %edx + addq %rcx, %rdx + imulq %rdx, %rax // %rax = x * (1 + e) * (1 + e^2) * (1 + e^4) + + imulq %rcx, %rcx // %rcx = e^8 + movl $1, %edx + addq %rcx, %rdx + imulq %rdx, %rax // %rax = x * (1 + e) * ... * * (1 + e^8) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_popcount.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_popcount.S new file mode 100644 index 00000000000..9647b2cc862 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_popcount.S @@ -0,0 +1,70 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Count number of set bits in a single 64-bit word (population count) +// Input a; output function return +// +// extern uint64_t word_popcount (uint64_t a); +// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_popcount) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_popcount) + .text + +S2N_BN_SYMBOL(word_popcount): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// The code is generated by gcc -O3 (version 11.4.0) from +// +// uint64_t word_popcount(uint64_t x) +// { uint64_t x2 = x - ((x & UINT64_C(0xAAAAAAAAAAAAAAAA))>>1); +// uint64_t x4 = (x2 & UINT64_C(0x3333333333333333)) + +// ((x2 & UINT64_C(0xCCCCCCCCCCCCCCCC))>>2); +// uint64_t x8 = (x4 + (x4>>4)) & UINT64_C(0x0F0F0F0F0F0F0F0F); +// uint64_t x64 = x8 * UINT64_C(0x101010101010101); +// uint64_t y = x64>>56; +// return y; +// } + + movabsq $0x5555555555555555, %rdx + movq %rdi, %rax + shrq $1, %rax + andq %rdx, %rax + subq %rax, %rdi + movabsq $0x3333333333333333, %rax + movq %rdi, %rdx + andq %rax, %rdi + shrq $0x2, %rdx + andq %rax, %rdx + addq %rdi, %rdx + movq %rdx, %rax + shrq $0x4, %rax + addq %rdx, %rax + movabsq $0xf0f0f0f0f0f0f0f, %rdx + andq %rdx, %rax + movabsq $0x101010101010101, %rdx + imulq %rdx, %rax + shrq $0x38, %rax + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git 
a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_recip.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_recip.S new file mode 100644 index 00000000000..dc2c0f91813 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/generic/word_recip.S @@ -0,0 +1,144 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Single-word reciprocal, underestimate of 2^128 / a with implicit 1 added +// Input a; output function return +// +// extern uint64_t word_recip (uint64_t a); +// +// Given an input word "a" with its top bit set (i.e. 2^63 <= a < 2^64), the +// result "x" is implicitly augmented with a leading 1 giving x' = 2^64 + x. +// The result is x' = ceil(2^128 / a) - 1, which except for the single +// special case a = 2^63 is the same thing as x' = floor(2^128 / a). +// +// Standard x86-64 ABI: RDI = a, returns RAX +// Microsoft x64 ABI: RCX = a, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(word_recip) + S2N_BN_SYM_PRIVACY_DIRECTIVE(word_recip) + .text + +#define a %rdi +#define x %rcx +#define b %rsi + +# Some aliasing here + +#define t %rax +#define l %rax + +#define d %rdx +#define h %rdx + +S2N_BN_SYMBOL(word_recip): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// Scale the input down: b overestimates a/2^16 with b <= 2^48 and +// x underestimates 2^64/b with b * x =~= 2^64, accurate to ~2 bits. + + movq a, b + movq $0x1FFFFFFFFFFFF, x + shrq $16, b + xorq b, x + incq b + shrq $32, x + +// Suppose x = 2^64/b * (1 - e). and get scaled error d = 2^64 * e + + movq b, d + imulq x, d + negq d + +// Rescale to give c = 2^15 * e (so c <= 2^13) and compute +// e + e^2 + e^3 + e^4 = (1 + e^2) (e + e^2) +// = (2^30 + c^2) * (2^15 * c + c^2) / 2^60 +// and then x * (1 + e + e^2 + e^3 + e^4) +// = (2^30 * x + x * (2^30 + c^2) * (2^30 * c + c^2) / 2^30) / 2^30 + + movq d, t + shrq $49, t + imulq t, t + shrq $34, d + addq t, d + orq $0x40000000, t + imulq d, t + shrq $30, t + imulq x, t + shlq $30, x + addq t, x + shrq $30, x + +// Now b * x =~= 2^64, accurate to ~10 bits. +// Do a 64-bit Newton step, scaling up x by 16 bits in the process. + + movq b, d + imulq x, d + negq d + shrq $24, d + imulq x, d + shlq $16, x + shrq $24, d + addq d, x + +// Now b * x =~= 2^80, accurate to ~20 bits. +// Do a 64-bit Newton step, scaling up x by 31 bits in the process + + movq b, d + imulq x, d + negq d + shrq $32, d + imulq x, d + shlq $31, x + shrq $17, d + addq d, x + +// Now a * x =~= 2^127, accurate to ~40 bits. Do a Newton step at full size. +// Instead of literally negating the product (h,l) we complement bits in +// the extracted bitfield, which is close enough and a bit faster. +// At the end we also shift x one more bit left, losing the known-1 top bit +// so that a * (2^64 + x) =~= 2^128. + + movq a, l + mulq x + shrdq $60, h, l + movq x, h + shrq $33, h + notq l + imulq h, l + shlq $1, x + shrq $33, l + addq l, x + +// Test if (x' + 1) * a < 2^128 where x' = 2^64 + x, catching the special +// case where x + 1 would wrap, corresponding to input a = 2^63. 
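Reviewer aid, not part of the patch: the specification in the header of word_recip above can be cross-checked against a one-line 128-bit reference, since ceil(2^128/a) - 1 = floor((2^128 - 1)/a) for any nonzero a. The unsigned __int128 type is a GCC/Clang extension and the helper name is illustrative.

#include <stdint.h>

static uint64_t word_recip_ref(uint64_t a)                // requires 2^63 <= a
{
  unsigned __int128 all_ones = ~(unsigned __int128)0;     // 2^128 - 1
  unsigned __int128 xprime = all_ones / a;                // 2^64 + result
  return (uint64_t)xprime;                                // drop the implicit top 1
}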
+ + addq $1, x + movq a, l + sbbq $0, x + mulq x + movq x, %rax + addq a, h + +// Select either x or x + 1 accordingly as the final answer + + sbbq $0, %rax +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_add_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_add_p256.S new file mode 100644 index 00000000000..76ab05b3430 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_add_p256.S @@ -0,0 +1,100 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_256, z := (x + y) mod p_256, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p256) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %rdx +#define c %r11 + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_add_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and add the two inputs as 2^256 * c + [d3;d2;d1;d0] = x + y + + xorq c, c + movq (x), d0 + addq (y), d0 + movq 8(x), d1 + adcq 8(y), d1 + movq 16(x), d2 + adcq 16(y), d2 + movq 24(x), d3 + adcq 24(y), d3 + adcq c, c + +// Now subtract 2^256 * c + [d3;d3;d1;d1] = x + y - p_256 +// The constants n1 and n3 in [n3; 0; n1; -1] = p_256 are saved for later + + subq $-1, d0 + movl $0x00000000ffffffff, n1short + sbbq n1, d1 + sbbq $0, d2 + movq $0xffffffff00000001, n3 + sbbq n3, d3 + +// Since by hypothesis x < p_256 we know x + y - p_256 < 2^256, so the top +// carry c actually gives us a bitmask for x + y - p_256 < 0, which we +// now use to make a masked p_256' = [n3; 0; n1; c] + + sbbq $0, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_bigendian_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_bigendian_4.S new file mode 100644 index 00000000000..01684e73773 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_bigendian_4.S @@ -0,0 +1,88 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
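Reviewer aid, not part of the patch: the add-then-masked-correct idea of bignum_add_p256 above can be restated with 128-bit limb arithmetic in C. The helper name, the u128 typedef (GCC/Clang extension) and the explicit carry/borrow bookkeeping are illustrative, and the inputs are assumed already reduced mod p_256 as in the header comment.

#include <stdint.h>
typedef unsigned __int128 u128;   // GCC/Clang extension

static void bignum_add_p256_ref(uint64_t z[4], const uint64_t x[4], const uint64_t y[4])
{
  static const uint64_t p256[4] = { 0xffffffffffffffffULL, 0x00000000ffffffffULL,
                                    0x0000000000000000ULL, 0xffffffff00000001ULL };
  uint64_t d[4], c = 0, b = 0;
  for (int i = 0; i < 4; i++) {                 // d = x + y with top carry c
    u128 t = (u128)x[i] + y[i] + c;
    d[i] = (uint64_t)t;  c = (uint64_t)(t >> 64);
  }
  for (int i = 0; i < 4; i++) {                 // d = d - p_256 with borrow b
    u128 t = (u128)d[i] - p256[i] - b;
    d[i] = (uint64_t)t;  b = (uint64_t)(t >> 64) & 1;
  }
  uint64_t mask = (c < b) ? ~(uint64_t)0 : 0;   // x + y - p_256 went negative
  c = 0;
  for (int i = 0; i < 4; i++) {                 // masked corrective addition of p_256
    u128 t = (u128)d[i] + (p256[i] & mask) + c;
    z[i] = (uint64_t)t;  c = (uint64_t)(t >> 64);
  }
}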
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert 4-digit (256-bit) bignum to/from big-endian form +// Input x[4]; output z[4] +// +// extern void bignum_bigendian_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The same function is given two other prototypes whose names reflect the +// treatment of one or other argument as a byte array rather than word array: +// +// extern void bignum_frombebytes_4 +// (uint64_t z[static 4], uint8_t x[static 32]); +// +// extern void bignum_tobebytes_4 +// (uint8_t z[static 32], uint64_t x[static 4]); +// +// Since x86 is little-endian, and bignums are stored with little-endian +// word order, this is simply byte reversal and is implemented as such. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_bigendian_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_bigendian_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_frombebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_frombebytes_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tobebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tobebytes_4) + + .text + +#define z %rdi +#define x %rsi +#define a %rax +#define b %rdx + +// All loads and stores are word-sized, then we use BSWAP to +// reverse the byte order, as well as switching round the word order +// when writing back. The reads and writes are organized in mirror-image +// pairs (0-3 and 1-2) to allow x and z to point to the same buffer +// without using more intermediate registers. + +S2N_BN_SYMBOL(bignum_bigendian_4): +S2N_BN_SYMBOL(bignum_frombebytes_4): +S2N_BN_SYMBOL(bignum_tobebytes_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// 0 and 3 words + + movq (x), a + movq 24(x), b + bswapq a + bswapq b + movq a, 24(z) + movq b, (z) + +// 1 and 2 words + + movq 8(x), a + movq 16(x), b + bswapq a + bswapq b + movq a, 16(z) + movq b, 8(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256.S new file mode 100644 index 00000000000..19883af3c24 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256.S @@ -0,0 +1,129 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply +#define x %rcx +// Likewise this is thrown away after initial multiply +#define m %rdx + +#define a %rax +#define c %rcx + +#define d0 %rsi +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 +#define h %r11 + +#define ashort %eax + +// Multiplier again for second stage +#define q %rdx + + +S2N_BN_SYMBOL(bignum_cmul_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want multiplier in %rdx) + + movq %rdx, x + movq %rsi, m + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] + + mulxq (x), d0, d1 + mulxq 8(x), a, d2 + addq a, d1 + mulxq 16(x), a, d3 + adcq a, d2 + mulxq 24(x), a, h + adcq a, d3 + adcq $0, h + +// Writing the product as z = 2^256 * h + 2^192 * d3 + t = 2^192 * hl + t, our +// intended quotient approximation is (hl + hl>>32 + 1)>>64. Note that by +// hypothesis our product is <= (2^64 - 1) * (p_256 - 1), so there is no need +// to max this out to avoid wrapping. + + movq h, a + shldq $32, d3, a + movq h, q + shrq $32, q + + xorq c, c + subq $1, c + + adcq d3, a + adcq h, q + +// Now compute the initial pre-reduced result z - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q + + addq q, d0 + movq $0x0000000100000000, a + mulxq a, a, c + sbbq $0, a + sbbq $0, c + subq a, d1 + sbbq c, d2 + movq $0xffffffff00000001, a + mulxq a, a, c + sbbq a, d3 + sbbq c, h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_256 and write back + + movl $0x00000000ffffffff, ashort + andq h, a + xorq c, c + subq a, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256_alt.S new file mode 100644 index 00000000000..d68c947402a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_cmul_p256_alt.S @@ -0,0 +1,146 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256, z := (c * x) mod p_256, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256_alt +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256_alt) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply then thrown away + +#define x %rcx +#define m %rsi + +// Other variables + +#define d %rdx +#define a %rax +#define c %rcx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 +#define h %rsi + +#define ashort %eax +#define hshort %esi + +// Multiplier again for second stage + +#define q %rcx + +S2N_BN_SYMBOL(bignum_cmul_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want %rdx for the high parts of products) + + movq %rdx, x + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] + + movq (x), a + mulq m + movq a, d0 + movq d, d1 + + movq 8(x), a + mulq m + xorq d2, d2 + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq m + xorq d3, d3 + addq a, d2 + adcq d, d3 + + movq 24(x), a + mulq m + xorl hshort, hshort + addq a, d3 + adcq d, h + +// Writing the product as z = 2^256 * h + 2^192 * d3 + t = 2^192 * hl + t, our +// intended quotient approximation is (hl + hl>>32 + 1)>>64. Note that by +// hypothesis our product is <= (2^64 - 1) * (p_256 - 1), so there is no need +// to max this out to avoid wrapping. + + movq h, a + shldq $32, d3, a + movq h, q + shrq $32, q + xorq d, d + subq $1, d + adcq d3, a + adcq h, q + +// Now compute the initial pre-reduced result z - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q + + movq $0x0000000100000000, a + mulq q + addq q, d0 + sbbq $0, a + sbbq $0, d + subq a, d1 + sbbq d, d2 + sbbq $0, d3 + sbbq $0, h + movq $0xffffffff00000001, a + mulq q + subq a, d3 + sbbq d, h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_256 and write back + + movl $0x00000000ffffffff, ashort + andq h, a + xorq c, c + subq a, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256.S new file mode 100644 index 00000000000..6c0e66ec23f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256.S @@ -0,0 +1,145 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_deamont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256) + .text + +#define z %rdi +#define x %rsi + +// Re-use these as temporaries in the correction phase + +#define d %rdx +#define u %r10 +#define v %r11 + +#define dshort %edx +#define ushort %r10d + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +S2N_BN_SYMBOL(bignum_deamont_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save one more register to play with + + pushq %rbx + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Fill in two zeros to the left + + xorq %rbx, %rbx + xorq %rsi, %rsi + +// Montgomery reduce windows 0 and 1 together + + movq $0x0000000100000000, %rdx + mulpadd(%r10,%r9,%r8) + mulpadd(%r11,%r10,%r9) + movq $0xffffffff00000001, %rdx + mulpadd(%rbx,%r11,%r8) + mulpadd(%rsi,%rbx,%r9) + movl $0, %r8d + adcxq %r8, %rsi + +// Append just one more leading zero (by the above %r8 = 0 already). + + xorq %r9, %r9 + +// Montgomery reduce windows 2 and 3 together + + movq $0x0000000100000000, %rdx + mulpadd(%rbx,%r11,%r10) + mulpadd(%rsi,%rbx,%r11) + movq $0xffffffff00000001, %rdx + mulpadd(%r8,%rsi,%r10) + mulpadd(%r9,%r8,%r11) + movl $0, %r10d + adcxq %r10, %r9 + +// We now have a pre-reduced dd = [%r9;%r8;%rsi;%rbx]. Load non-trivial digits +// of p_256 = [v; 0; u; -1] + + movl $0x00000000ffffffff, ushort + movq $0xffffffff00000001, v + +// Now do the subtraction (p_256-1) - (%r9;%r8;%rsi;%rbx) to get the carry + + movq $-2, d + subq %rbx, d + movq u, d + sbbq %rsi, d + movl $0, dshort + sbbq %r8, d + movq v, d + sbbq %r9, d + +// Convert the carry CF <=> dd >= p_256 to a bitmask and do a masked subtraction + + sbbq d, d + andq d, u + andq d, v + + subq d, %rbx + sbbq u, %rsi + sbbq $0, %r8 + sbbq v, %r9 + +// Write back + + movq %rbx, (z) + movq %rsi, 8(z) + movq %r8, 16(z) + movq %r9, 24(z) + +// Restore saved register and return + + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256_alt.S new file mode 100644 index 00000000000..a02ce2f2fab --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_deamont_p256_alt.S @@ -0,0 +1,158 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_deamont_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// Re-use these as temporaries in the correction phase + +#define d %rdx +#define u %rax +#define v %rcx + +#define dshort %edx +#define ushort %eax + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpado(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +// Add %rcx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Version with no carry in or out + +#define mulpadn(high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + addq %rax, low ; \ + adcq %rdx, high + +S2N_BN_SYMBOL(bignum_deamont_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Load constant 2^32; %rcx toggles between this and (1 - %rcx) below + + movq $0x0000000100000000, %rcx + +// Montgomery reduce windows 0 and 1 together as [%r8;%rsi;%r11;%r10] + + mulpadi(%rsi,%r10,%r9,%r8) + mulpadd(%rsi,%r11,%r10,%r9) + negq %rcx + negq %rsi + incq %rcx + mulpadi(%r8,%rsi,%r11,%r8) + negq %r8 + mulpadn(%r8,%rsi,%r9) + +// Montgomery reduce windows 2 and 3 together as [%r10;%r9;%r8;%rsi] + + negq %rcx + incq %rcx + mulpadi(%r9,%rsi,%r11,%r10) + mulpadd(%r9,%r8,%rsi,%r11) + negq %rcx + negq %r9 + incq %rcx + mulpadi(%r10,%r9,%r8,%r10) + negq %r10 + mulpadn(%r10,%r9,%r11) + +// We now have a pre-reduced result z = [%r10;%r9;%r8;%rsi]. +// From the above we have %rcx = 0xffffffff00000001, which we use to generate +// [0x00000000fffffffe; -1; 0xffffffff00000000; 1] = 2^256 - p_256 and +// then compute [%rcx;%rdx;%r11;%rax] = z + (2^256 - p_256) + + xorl %edx, %edx + leaq 1(%rdx), %rax + addq %rsi, %rax + leaq -1(%rcx), %r11 + adcq %r8, %r11 + notq %rdx + adcq %r9, %rdx + notq %rcx + adcq %r10, %rcx + +// CF is set iff z + (2^256 - p_256) >= 2^256, i.e. if z >= p_256. 
+// If so we want the result of the subtraction (in 4 words) + + cmovcq %rax, %rsi + cmovcq %r11, %r8 + cmovcq %rdx, %r9 + cmovcq %rcx, %r10 + +// Write back + + movq %rsi, (z) + movq %r8, 8(z) + movq %r9, 16(z) + movq %r10, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256.S new file mode 100644 index 00000000000..d9bc8c66e06 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256.S @@ -0,0 +1,112 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_256 for correctness. If this is not the case, +// use the variant "bignum_deamont_p256" instead. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256) + .text + +#define z %rdi +#define x %rsi + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +S2N_BN_SYMBOL(bignum_demont_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save one more register to play with + + pushq %rbx + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Fill in two zeros to the left + + xorq %rbx, %rbx + xorq %rsi, %rsi + +// Montgomery reduce windows 0 and 1 together + + movq $0x0000000100000000, %rdx + mulpadd(%r10,%r9,%r8) + mulpadd(%r11,%r10,%r9) + movq $0xffffffff00000001, %rdx + mulpadd(%rbx,%r11,%r8) + mulpadd(%rsi,%rbx,%r9) + movl $0, %r8d + adcxq %r8, %rsi + +// Append just one more leading zero (by the above %r8 = 0 already). + + xorq %r9, %r9 + +// Montgomery reduce windows 2 and 3 together + + movq $0x0000000100000000, %rdx + mulpadd(%rbx,%r11,%r10) + mulpadd(%rsi,%rbx,%r11) + movq $0xffffffff00000001, %rdx + mulpadd(%r8,%rsi,%r10) + mulpadd(%r9,%r8,%r11) + movl $0, %r10d + adcxq %r10, %r9 + +// Since the input was assumed reduced modulo, i.e. < p, we actually know that +// 2^256 * [carries; %r9;%r8;%rsi;%rbx] is <= (p - 1) + (2^256 - 1) p +// and hence [carries; %r9;%r8;%rsi;%rbx] < p. This means in fact carries = 0 +// and [%r9;%r8;%rsi;%rbx] is already our answer, without further correction. +// Write that back. 
+ + movq %rbx, (z) + movq %rsi, 8(z) + movq %r8, 16(z) + movq %r9, 24(z) + +// Restore saved register and return + + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256_alt.S new file mode 100644 index 00000000000..f53228cb9fd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_demont_p256_alt.S @@ -0,0 +1,130 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_256 for correctness. If this is not the case, +// use the variant "bignum_deamont_p256" instead. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpado(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +// Add %rcx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Version with no carry in or out + +#define mulpadn(high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + addq %rax, low ; \ + adcq %rdx, high + +S2N_BN_SYMBOL(bignum_demont_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Load constant 2^32; %rcx toggles between this and (1 - %rcx) below + + movq $0x0000000100000000, %rcx + +// Montgomery reduce windows 0 and 1 together as [%r8;%rsi;%r11;%r10] + + mulpadi(%rsi,%r10,%r9,%r8) + mulpadd(%rsi,%r11,%r10,%r9) + negq %rcx + negq %rsi + incq %rcx + mulpadi(%r8,%rsi,%r11,%r8) + negq %r8 + mulpadn(%r8,%rsi,%r9) + +// Montgomery reduce windows 2 and 3 together as [%r10;%r9;%r8;%rsi] + + negq %rcx + incq %rcx + mulpadi(%r9,%rsi,%r11,%r10) + mulpadd(%r9,%r8,%rsi,%r11) + negq %rcx + negq %r9 + incq %rcx + mulpadi(%r10,%r9,%r8,%r10) + negq %r10 + mulpadn(%r10,%r9,%r11) + +// Since the input was assumed reduced modulo, i.e. < p, we actually know that +// 2^256 * [carries; %r10;%r9;%r8;%rsi] is <= (p - 1) + (2^256 - 1) p +// and hence [carries; %r10;%r9;%r8;%rsi] < p. 
This means in fact carries = 0 +// and [%r10;%r9;%r8;%rsi] is already our answer, without further correction. +// Write that back. + + movq %rsi, (z) + movq %r8, 8(z) + movq %r9, 16(z) + movq %r10, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_double_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_double_p256.S new file mode 100644 index 00000000000..8c1b2cf2959 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_double_p256.S @@ -0,0 +1,99 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_256, z := (2 * x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p256) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 +#define c %rax + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_double_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input and double it so that 2^256 * c + [d3;d2;d1;d0] = 2 * x +// Could also consider using shld to decouple carries + + xorq c, c + movq (x), d0 + addq d0, d0 + movq 8(x), d1 + adcq d1, d1 + movq 16(x), d2 + adcq d2, d2 + movq 24(x), d3 + adcq d3, d3 + adcq c, c + +// Now subtract 2^256 * c + [d3;d3;d1;d1] = 2 * x - p_256 +// The constants n1 and n3 in [n3; 0; n1; -1] = p_256 are saved for later + + subq $-1, d0 + movl $0x00000000ffffffff, n1short + sbbq n1, d1 + sbbq $0, d2 + movq $0xffffffff00000001, n3 + sbbq n3, d3 + +// Since by hypothesis x < p_256 we know 2 * x - p_256 < 2^256, so the top +// carry c actually gives us a bitmask for 2 * x - p_256 < 0, which we +// now use to make a masked p_256' = [n3; 0; n1; c] + + sbbq $0, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_half_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_half_p256.S new file mode 100644 index 00000000000..2c2da0f9e27 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_half_p256.S @@ -0,0 +1,91 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_256, z := (x / 2) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_p256) + .text + +#define z %rdi +#define x %rsi + +#define a %rax +#define d0 %rcx +#define d1 %rdx +#define d2 %r8 +#define d3 %r9 + +#define d0short %ecx +#define d1short %edx + + + +S2N_BN_SYMBOL(bignum_half_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load lowest digit and get a mask for its lowest bit in d0 + + movq (x), a + movl $1, d0short + andq a, d0 + negq d0 + +// Create a masked version of p_256 + + movl $0x00000000ffffffff, d1short + xorq d3, d3 + andq d0, d1 + subq d1, d3 + xorq d2, d2 + +// Perform addition with masked p_256. Catch the carry in a, as a bitmask +// for convenience though we only use its LSB below with SHRD + + addq a, d0 + adcq 8(x), d1 + adcq 16(x), d2 + adcq 24(x), d3 + sbbq a, a + +// Shift right, pushing the carry back down, and store back + + shrdq $1, d1, d0 + movq d0, (z) + shrdq $1, d2, d1 + movq d1, 8(z) + shrdq $1, d3, d2 + movq d2, 16(z) + shrdq $1, a, d3 + movq d3, 24(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_inv_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_inv_p256.S new file mode 100644 index 00000000000..c75ef212679 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_inv_p256.S @@ -0,0 +1,1623 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_inv_p256(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_256, i.e. is not divisible +// by it, returns z < p_256 such that x * z == 1 (mod p_256). Note that +// x does not need to be reduced modulo p_256, but the output always is. +// If the input is divisible (i.e. is 0 or p_256), then there can be no +// modular inverse and z = 0 is returned. 
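+//
+// Method sketch, as read off the code below: a divstep-style binary-gcd
+// iteration.  Ten batches of 59 divsteps (the divstep59 macro) are run on
+// low words of f and g; each batch yields a 2x2 transition matrix that is
+// applied to the 5-word f and g and to the auxiliary values u and v, with
+// an almost-Montgomery reduction (amontred) keeping u and v at 4 words.
+// The first nine batches update all of f, g, u and v; the tenth needs only
+// u and the sign of f, and is handled separately at the end.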
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p256) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define f 0(%rsp) +#define g (5*N)(%rsp) +#define u (10*N)(%rsp) +#define v (15*N)(%rsp) +#define tmp (20*N)(%rsp) +#define tmp2 (21*N)(%rsp) +#define i (22*N)(%rsp) +#define d (23*N)(%rsp) + +#define mat (24*N)(%rsp) + +// Backup for the input pointer + +#define res (28*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (30*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (5*N) +#define U (10*N) +#define V (15*N) +#define MAT (24*N) + +#define ff (%rsp) +#define gg (5*N)(%rsp) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro from u[4..0] to u[3..0]. +// --------------------------------------------------------------------------- + +#define amontred(P) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_256. */ \ + movq $0xe000000000000000, %r8 ; \ + addq P, %r8 ; \ + movq $0xffffffffffffffff, %r9 ; \ + adcq 8+P, %r9 ; \ + movq $0x000000001fffffff, %r10 ; \ + adcq 16+P, %r10 ; \ + movq $0x2000000000000000, %r11 ; \ + adcq 24+P, %r11 ; \ + movq $0x1fffffffe0000000, %r12 ; \ + adcq 32+P, %r12 ; \ +/* Let [%r8;%rbx] = 2^32 * w and [%rdx;%rax] = (2^64 - 2^32 + 1) * w */ \ +/* where w is the lowest word */ \ + movq %r8, %rbx ; \ + shlq $32, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %r8; \ + shrq $32, %r8 ; \ +/* Hence basic addition of (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + addq %rbx, %r9 ; \ + adcq %r8, %r10 ; \ + adcq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ +/* Now capture carry and subtract p_256 if set (almost-Montgomery) */ \ + sbbq %rax, %rax ; \ + movl $0x00000000ffffffff, %ebx ; \ + andq %rax, %rbx ; \ + movq $0xffffffff00000001, %rdx ; \ + andq %rax, %rdx ; \ + subq %rax, %r9 ; \ + movq %r9, P ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 8+P ; \ + sbbq $0, %r11 ; \ + movq %r11, 16+P ; \ + sbbq %rdx, %r12 ; \ + movq %r12, 24+P + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; 
\ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq 
$0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, 
%r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq 
MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + 
subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_inv_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Create constant [%rdx;%rcx;%rbx;%rax] = p_256 and copy it into the variable 
f +// including the 5th zero digit + + xorl %ecx, %ecx + movl $0x00000000ffffffff, %edx + movq %rdx, %rbx + leaq -1(%rcx), %rax + negq %rdx + movq %rax, F(%rsp) + movq %rbx, F+8(%rsp) + movq %rcx, F+16(%rsp) + movq %rdx, F+24(%rsp) + movq %rcx, F+32(%rsp) + +// Now reduce the input modulo p_256, first negating the constant to get +// [%rdx;%rcx;%rbx;%rax] = 2^256 - p_256, adding it to x and hence getting +// the comparison x < p_256 <=> (2^256 - p_256) + x < 2^256 and choosing +// g accordingly. + + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + + leaq 1(%rcx), %rax + addq %r8, %rax + leaq -1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq %rax, G(%rsp) + movq %rbx, G+8(%rsp) + movq %rcx, G+16(%rsp) + movq %rdx, G+24(%rsp) + xorl %eax, %eax + movq %rax, G+32(%rsp) + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-50} * [u,v] (mod p_256) +// starting with [p_256,x] == x * 2^{5*0-50} * [0,2^50] (mod p_256) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+8(%rsp) + movq %rax, U+16(%rsp) + movq %rax, U+24(%rsp) + + movq $0x0004000000000000, %rcx + movq %rcx, V(%rsp) + movq %rax, V+8(%rsp) + movq %rax, V+16(%rsp) + movq %rax, V+24(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + movq $10, i + movq $1, d + jmp bignum_inv_p256_midloop + +bignum_inv_p256_loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
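+//
+// (Why the offsets above work: each matrix entry is kept as a magnitude m
+// together with a sign bitmask s, and a signed product is then formed as
+// (digit XOR s) * m.  Since -x = NOT(x) + 1 across digits, the complemented
+// products come out short by exactly m whenever s is set, which is what the
+// per-row offset (m AND s), kept in %rdi/%rsi here and in tmp/tmp2 for the
+// later [u,v] pass, restores.)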
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digits 3 and 4 of [f,g] + + movq F+3*N(%rsp), %rax + xorq %r9, %rax + movq F+4*N(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + movq G+4*N(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + shrdq $59, %rbp, %rsi + sarq $59, %rbp + + movq F+3*N(%rsp), %rax + movq %rsi, F+3*N(%rsp) + + movq F+4*N(%rsp), %rsi + movq %rbp, F+4*N(%rsp) + + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq G+3*N(%rsp), %rax + xorq %r15, %rax + movq G+4*N(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + shrdq $59, %rsi, %rbx + movq %rbx, G+3*N(%rsp) + sarq $59, %rsi + movq %rsi, G+4*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq 
%rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Preload for last use of old u digit 3 + + movq U+3*N(%rsp), %rax + movq %rcx, U+3*N(%rsp) + movq %rdx, U+4*N(%rsp) + +// Digits 3 and 4 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+3*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, V+3*N(%rsp) + movq %rdx, V+4*N(%rsp) + +// Montgomery reduction of u + + amontred(u) + +// Montgomery reduction of v + + amontred(v) + +bignum_inv_p256_midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz bignum_inv_p256_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_256) +// we want to flip the sign of u according to that of f. 
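+//
+// Below, %rax is a bitmask that is all 1s exactly when f = -1; XORing it
+// into each sign mask composes the sign of the matrix entry with the sign
+// of f.  The single remaining accumulation (digits 0..4 of u) therefore
+// computes sign(f) * (entry00 * u + entry01 * v), which after one more
+// Montgomery reduction and the final strict reduction modulo p_256 is the
+// result written to z.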
+ + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + +// Store back and Montgomery reduce u + + movq %r12, U(%rsp) + movq %r13, U+N(%rsp) + movq %r14, U+2*N(%rsp) + movq %r15, U+3*N(%rsp) + movq %r9, U+4*N(%rsp) + + amontred(u) + +// Perform final strict reduction mod p_256 and copy to output + + movq U(%rsp), %r8 + movq U+N(%rsp), %r9 + movq U+2*N(%rsp), %r10 + movq U+3*N(%rsp), %r11 + + movl $1, %eax + movl $0xffffffff, %ebx + leaq -2(%rax), %rcx + leaq -1(%rbx), %rdx + notq %rbx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq res, %rdi + movq %rax, (%rdi) + movq %rbx, N(%rdi) + movq %rcx, 2*N(%rdi) + movq %rdx, 3*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_littleendian_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_littleendian_4.S new file mode 100644 index 00000000000..e378441427a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_littleendian_4.S @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert 4-digit (256-bit) bignum to/from little-endian form +// Input x[4]; output z[4] +// +// extern void bignum_littleendian_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The same function is given two other prototypes whose names reflect the +// treatment of one or other argument as a byte array rather than word array: +// +// extern void bignum_fromlebytes_4 +// (uint64_t z[static 4], uint8_t x[static 32]); +// +// extern void bignum_tolebytes_4 +// (uint8_t z[static 32], uint64_t x[static 4]); +// +// Since x86 is little-endian, this is just copying. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_littleendian_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_littleendian_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_fromlebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_fromlebytes_4) + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tolebytes_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tolebytes_4) + + .text + +#define z %rdi +#define x %rsi +#define a %rax + +S2N_BN_SYMBOL(bignum_littleendian_4): +S2N_BN_SYMBOL(bignum_fromlebytes_4): +S2N_BN_SYMBOL(bignum_tolebytes_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + + movq (x), a + movq a, (z) + + movq 8(x), a + movq a, 8(z) + + movq 16(x), a + movq a, 16(z) + + movq 24(x), a + movq a, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256.S new file mode 100644 index 00000000000..2ada6c29bbb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256.S @@ -0,0 +1,205 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_n256 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the NIST curve P-256. 
+// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rdx +#define q %rdx + +#define n0short %eax +#define n3short %edx + + +S2N_BN_SYMBOL(bignum_mod_n256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_n256_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Reduce the top 4 digits mod n_256 (a conditional subtraction of n_256) + + movq $0x0c46353d039cdaaf, n0 + movq $0x4319055258e8617b, n1 + movl $0x00000000ffffffff, n3short + + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + sbbq d, d + notq d + andq d, n0 + andq d, n1 + andq d, n3 + subq n0, m0 + sbbq n1, m1 + sbbq $0, m2 + sbbq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_n256_writeback + +bignum_mod_n256_loop: + +// Writing the input as z = 2^256 * m3 + 2^192 * m2 + t = 2^192 * h + t, our +// intended quotient approximation is MIN ((h + h>>32 + 1)>>64) (2^64 - 1). + + movq m3, n0 + shldq $32, m2, n0 + movq m3, q + shrq $32, q + + xorq n1, n1 + subq $1, n1 + + adcq m2, n0 + adcq m3, q + sbbq n0, n0 + orq n0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now form [m3;m2;m1;m0;d] = m - q * n_256 + + subq q, m3 + movq $0x0c46353d039cdaaf, n0 + mulxq n0, n0, n1 + addq n0, d + adcq n1, m0 + movq $0x4319055258e8617b, n0 + mulxq n0, n0, n1 + adcq $0, n1 + addq n0, m0 + adcq n1, m1 + movl $0x00000000ffffffff, n0short + mulxq n0, n0, n1 + adcq n0, m2 + adcq n1, m3 + +// Now our top word m3 is either zero or all 1s. 
Use it for a masked +// addition of n_256, which we can do by a *subtraction* of +// 2^256 - n_256 from our portion + + movq $0x0c46353d039cdaaf, n0 + andq m3, n0 + movq $0x4319055258e8617b, n1 + andq m3, n1 + movl $0x00000000ffffffff, n3short + andq m3, n3 + + subq n0, d + sbbq n1, m0 + sbbq $0, m1 + sbbq n3, m2 + +// Now shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_n256_loop + +// Write back + +bignum_mod_n256_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_n256_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_n256_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_n256_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_n256_writeback + movq 16(%rdx), m2 + jmp bignum_mod_n256_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_4.S new file mode 100644 index 00000000000..2c29a4ca607 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_4.S @@ -0,0 +1,100 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n256_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the NIST curve P-256. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256_4) + .text + +#define z %rdi +#define x %rsi + + + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n0 %rax +#define n1 %r10 +#define n3 %r11 + +#define n3short %r11d + +// Can re-use this as a temporary once we've loaded the input + +#define c %rsi + +S2N_BN_SYMBOL(bignum_mod_n256_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load a set of registers [n3; 0; n1; n0] = 2^256 - n_256 + + movq $0x0c46353d039cdaaf, n0 + movq $0x4319055258e8617b, n1 + movl $0x00000000ffffffff, n3short + +// Load the input and compute x + (2^256 - n_256) + + movq (x), d0 + addq n0, d0 + movq 8(x), d1 + adcq n1, d1 + movq 16(x), d2 + adcq $0, d2 + movq 24(x), d3 + adcq n3, d3 + +// Now CF is set iff 2^256 <= x + (2^256 - n_256), i.e. iff n_256 <= x. 
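As a cross-check of that carry argument, here is a small C model of the same conditional subtraction (limbs least significant first; the constants are the 2^256 - n_256 words loaded above). The assembly masks the constants and subtracts them back, while the sketch keeps the sum only when the carry fires; the net effect is the same. The helper name and the explicit temporary array are ours, not the library's.

    #include <stdint.h>

    /* Reduce a 4-limb x (already < 2*n_256) modulo n_256 with one conditional
       subtraction: add 2^256 - n_256, and keep the wrapped sum exactly when
       the addition carries out of 2^256, i.e. when n_256 <= x. */
    static void cond_sub_n256(uint64_t d[4]) {
        static const uint64_t R[4] = {   /* 2^256 - n_256 */
            0x0c46353d039cdaafULL, 0x4319055258e8617bULL,
            0x0000000000000000ULL, 0x00000000ffffffffULL
        };
        unsigned __int128 c = 0;
        uint64_t t[4];
        for (int i = 0; i < 4; i++) {    /* t = x + (2^256 - n_256) */
            c += (unsigned __int128)d[i] + R[i];
            t[i] = (uint64_t)c;
            c >>= 64;
        }
        uint64_t keep = (uint64_t)0 - (uint64_t)(c & 1);  /* all-ones iff n_256 <= x */
        for (int i = 0; i < 4; i++)
            d[i] = (d[i] & ~keep) | (t[i] & keep);
    }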
+// Create a mask for the condition x < n, and mask the three nontrivial digits +// ready to undo the previous addition with a compensating subtraction + + sbbq c, c + notq c + andq c, n0 + andq c, n1 + andq c, n3 + +// Now subtract mask * (2^256 - n_256) again and store + + subq n0, d0 + movq d0, (z) + sbbq n1, d1 + movq d1, 8(z) + sbbq $0, d2 + movq d2, 16(z) + sbbq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_alt.S new file mode 100644 index 00000000000..a3ef32f51fd --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_n256_alt.S @@ -0,0 +1,213 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_n256_alt +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the NIST curve P-256. +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256_alt) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rdx + +#define q %rbx + +#define n0short %eax +#define n3short %edx + + +S2N_BN_SYMBOL(bignum_mod_n256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_n256_alt_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Reduce the top 4 digits mod n_256 (a conditional subtraction of n_256) + + movq $0x0c46353d039cdaaf, n0 + movq $0x4319055258e8617b, n1 + movl $0x00000000ffffffff, n3short + + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + sbbq d, d + notq d + andq d, n0 + andq d, n1 + andq d, n3 + subq n0, m0 + sbbq n1, m1 + sbbq $0, m2 + sbbq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_n256_alt_writeback + +bignum_mod_n256_alt_loop: + +// Writing the input as z = 2^256 * m3 + 2^192 * m2 + t = 2^192 * h + t, our +// intended quotient approximation is MIN ((h + h>>32 + 1)>>64) (2^64 - 1). 
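The MIN expression in that comment translates directly to C. A hedged sketch (the function name is illustrative), using a 128-bit h = 2^64*m3 + m2 and an explicit saturation standing in for the sbb/or trick in the code below:

    #include <stdint.h>

    /* Quotient estimate for the 5->4 reduction step:
       q = MIN((h + (h >> 32) + 1) >> 64, 2^64 - 1),
       where h is formed from the top two limbs of the value being reduced. */
    static uint64_t quotient_estimate(uint64_t m2, uint64_t m3) {
        unsigned __int128 h = ((unsigned __int128)m3 << 64) | m2;
        unsigned __int128 s = h + (h >> 32) + 1;
        if (s < h)                      /* carried out of 128 bits: saturate */
            return ~(uint64_t)0;
        return (uint64_t)(s >> 64);     /* otherwise already <= 2^64 - 1     */
    }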
+ + movq m3, n0 + shldq $32, m2, n0 + movq m3, q + shrq $32, q + + xorq %rdx, %rdx + subq $1, %rdx + + adcq m2, n0 + adcq m3, q + sbbq n0, n0 + orq n0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now form [m3;m2;m1;m0;d] = m - q * n_256 + + subq q, m3 + + movq $0x0c46353d039cdaaf, %rax + mulq q + addq %rax, d + adcq %rdx, m0 + adcq $0, m1 + adcq $0, m2 + adcq $0, m3 + + movq $0x4319055258e8617b, %rax + mulq q + addq %rax, m0 + adcq %rdx, m1 + adcq $0, m2 + adcq $0, m3 + + movq $0x00000000ffffffff, %rax + mulq q + addq %rax, m2 + adcq %rdx, m3 + +// Now our top word m3 is either zero or all 1s. Use it for a masked +// addition of n_256, which we can do by a *subtraction* of +// 2^256 - n_256 from our portion + + movq $0x0c46353d039cdaaf, n0 + andq m3, n0 + movq $0x4319055258e8617b, n1 + andq m3, n1 + movl $0x00000000ffffffff, n3short + andq m3, n3 + + subq n0, d + sbbq n1, m0 + sbbq $0, m1 + sbbq n3, m2 + +// Now shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_n256_alt_loop + +// Write back + +bignum_mod_n256_alt_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_n256_alt_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_n256_alt_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_n256_alt_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_n256_alt_writeback + movq 16(%rdx), m2 + jmp bignum_mod_n256_alt_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256.S new file mode 100644 index 00000000000..19576bc4a51 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256.S @@ -0,0 +1,198 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_p256 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rdx +#define q %rdx + +#define n0short %eax +#define n1short %ebx + + +S2N_BN_SYMBOL(bignum_mod_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_p256_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Load non-trivial digits [n3; 0; n1; -1] = p_256 and do a conditional +// subtraction to reduce the four starting digits [m3;m2;m1;m0] modulo p_256 + + subq $-1, m0 + movl $0x00000000ffffffff, n1short + sbbq n1, m1 + movq $0xffffffff00000001, n3 + sbbq $0, m2 + sbbq n3, m3 + + sbbq n0, n0 + + andq n0, n1 + andq n0, n3 + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_p256_writeback + +bignum_mod_p256_loop: + +// Writing the input as z = 2^256 * m3 + 2^192 * m2 + t = 2^192 * h + t, our +// intended quotient approximation is MIN ((h + h>>32 + 1)>>64) (2^64 - 1). 
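The loop below ends with a masked addition of p_256 whenever the quotient estimate overshoots and the top word becomes an all-ones bitmask. A minimal C model of that correction on a 4-limb window is given here; the real code folds it into the 5-word window and simply drops the final carry, which cancels the earlier borrow. The helper name and the 4-limb simplification are ours.

    #include <stdint.h>

    /* Conditionally add p_256 back under a mask that is 0 or all-ones. */
    static void masked_add_p256(uint64_t d[4], uint64_t mask) {
        static const uint64_t P256[4] = {
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL
        };
        unsigned __int128 c = 0;
        for (int i = 0; i < 4; i++) {
            c += (unsigned __int128)d[i] + (P256[i] & mask);
            d[i] = (uint64_t)c;
            c >>= 64;
        }
        /* any carry out is discarded: it offsets the borrow that set the mask */
    }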
+ + movq m3, n0 + shldq $32, m2, n0 + movq m3, q + shrq $32, q + + xorq n1, n1 + subq $1, n1 + + adcq m2, n0 + adcq m3, q + sbbq n0, n0 + orq n0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now compute the initial pre-reduced [m3;m2;m1;m0;d] = m - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q + + addq q, d + movq $0x0000000100000000, n0 + mulxq n0, n0, n1 + sbbq $0, n0 + sbbq $0, n1 + subq n0, m0 + sbbq n1, m1 + movq $0xffffffff00000001, n0 + mulxq n0, n0, n1 + sbbq n0, m2 + sbbq n1, m3 + +// Now our top word m3 is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_256 + + movl $0x00000000ffffffff, n0short + andq m3, n0 + xorq n1, n1 + subq n0, n1 + addq m3, d + adcq n0, m0 + adcq $0, m1 + adcq n1, m2 + +// Shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_p256_loop + +// Write back + +bignum_mod_p256_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_p256_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_p256_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_p256_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_p256_writeback + movq 16(%rdx), m2 + jmp bignum_mod_p256_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_4.S new file mode 100644 index 00000000000..f87013791fb --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_4.S @@ -0,0 +1,88 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p256_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 +#define c %rax + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_mod_p256_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input and subtract to get [d3;d3;d1;d1] = x - p_256 (modulo 2^256) +// The constants n1 and n3 in [n3; 0; n1; -1] = p_256 are saved for later + + movq (x), d0 + subq $-1, d0 + movq 8(x), d1 + movl $0x00000000ffffffff, n1short + sbbq n1, d1 + movq 16(x), d2 + sbbq $0, d2 + movq $0xffffffff00000001, n3 + movq 24(x), d3 + sbbq n3, d3 + +// Capture the carry to determine whether to add back p_256, and use +// it to create a masked p_256' = [n3; 0; n1; c] + + sbbq c, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_alt.S new file mode 100644 index 00000000000..7ae9566d4ec --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mod_p256_alt.S @@ -0,0 +1,202 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256 +// Input x[k]; output z[4] +// +// extern void bignum_mod_p256_alt +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256_alt) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax + +#define n1 %rbx +#define q %rbx + +#define n3 %rdx + +#define n0short %eax +#define n1short %ebx + + +S2N_BN_SYMBOL(bignum_mod_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_p256_alt_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Load non-trivial digits [n3; 0; n1; -1] = p_256 and do a conditional +// subtraction to reduce the four starting digits [m3;m2;m1;m0] modulo p_256 + + subq $-1, m0 + movl $0x00000000ffffffff, n1short + sbbq n1, m1 + movq $0xffffffff00000001, n3 + sbbq $0, m2 + sbbq n3, m3 + + sbbq n0, n0 + + andq n0, n1 + andq n0, n3 + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_p256_alt_writeback + +bignum_mod_p256_alt_loop: + +// Writing the input as z = 2^256 * m3 + 2^192 * m2 + t = 2^192 * h + t, our +// intended quotient approximation is MIN ((h + h>>32 + 1)>>64) (2^64 - 1). 
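This _alt variant performs the same reduction with the classic widening mulq instead of the mulxq/adcx/adox used in bignum_mod_p256, presumably so it can run on x86-64 CPUs without the BMI2/ADX extensions. The building block it relies on is the 64x64 -> 128 multiply, which in C can be modeled as follows (illustrative helper, not from the source):

    #include <stdint.h>

    /* Widening 64x64 -> 128 multiply, the C counterpart of mulq: the low and
       high halves of the product feed the add/adc and sub/sbb chains below. */
    static void mul_64x64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo) {
        unsigned __int128 p = (unsigned __int128)a * b;
        *lo = (uint64_t)p;
        *hi = (uint64_t)(p >> 64);
    }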
+ + movq m3, n0 + shldq $32, m2, n0 + movq m3, q + shrq $32, q + + xorq n3, n3 + subq $1, n3 + + adcq m2, n0 + adcq m3, q + sbbq n0, n0 + orq n0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now compute the initial pre-reduced [m3;m2;m1;m0;d] = m - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q + + movq $0x0000000100000000, %rax + mulq q + addq q, d + sbbq $0, %rax + sbbq $0, %rdx + subq %rax, m0 + sbbq %rdx, m1 + sbbq $0, m2 + sbbq $0, m3 + movq $0xffffffff00000001, %rax + mulq q + subq %rax, m2 + sbbq %rdx, m3 + +// Now our top word m3 is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_256 + + movl $0x00000000ffffffff, n0short + andq m3, n0 + xorq n1, n1 + subq n0, n1 + addq m3, d + adcq n0, m0 + adcq $0, m1 + adcq n1, m2 + +// Shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_p256_alt_loop + +// Write back + +bignum_mod_p256_alt_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_p256_alt_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_p256_alt_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_p256_alt_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_p256_alt_writeback + movq 16(%rdx), m2 + jmp bignum_mod_p256_alt_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montinv_p256.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montinv_p256.S index 36f5d376e0c..b595645db9f 100644 --- a/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montinv_p256.S @@ -1017,6 +1017,7 @@ leaq (%rax,%rdx), %r12 S2N_BN_SYMBOL(bignum_montinv_p256): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256.S new file mode 100644 index 00000000000..5267c29f3b8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256.S @@ -0,0 +1,191 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256 (in particular this is true if we are in +// the "usual" case x < p_256 and y < p_256). 
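Since the low limb of p_256 is 2^64 - 1, the Montgomery constant -p_256^{-1} mod 2^64 is simply 1, so each word-level reduction step adds (low word) * p_256 and shifts the window down one limb. The routine below merges two such steps per pass, driving them with the constants 0x0000000100000000 and 0xffffffff00000001; a one-step reference model in C, with illustrative names, is:

    #include <stdint.h>

    /* One word-level Montgomery reduction step modulo p_256: replace the
       5-limb value t by (t + t[0] * p_256) / 2^64, which is exact because
       t + t[0] * p_256 == 0 (mod 2^64). Limbs are least significant first. */
    static void montgomery_step_p256(uint64_t t[5]) {
        static const uint64_t P256[4] = {
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL
        };
        uint64_t m = t[0];                  /* -p^{-1} mod 2^64 is 1 for p_256 */
        unsigned __int128 c = (unsigned __int128)m * P256[0] + t[0];
        c >>= 64;                           /* the low limb is now zero        */
        for (int i = 1; i < 4; i++) {
            c += (unsigned __int128)m * P256[i] + t[i];
            t[i - 1] = (uint64_t)c;
            c >>= 64;
        }
        c += t[4];
        t[3] = (uint64_t)c;
        t[4] = (uint64_t)(c >> 64);
    }

Four such steps, interleaved with the four product rows, bring the accumulator back to a 5-word form before the final conditional subtraction.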
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256) + .text + +#define z %rdi +#define x %rsi + +// We move the y argument here so we can use %rdx for multipliers + +#define y %rcx + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +S2N_BN_SYMBOL(bignum_montmul_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Do row 0 computation, which is a bit different: +// set up initial window [%r12,%r11,%r10,%r9,%r8] = y[0] * x +// Unlike later, we only need a single carry chain + + xorl %r13d, %r13d + movq (y), %rdx + mulxq (x), %r8, %r9 + mulxq 8(x), %rbx, %r10 + adcq %rbx, %r9 + mulxq 16(x), %rbx, %r11 + adcq %rbx, %r10 + mulxq 24(x), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + +// Add row 1 + + movq 8(y), %rdx + xorl %r14d, %r14d + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpadd(%r13,%r12,24(x)) + adcq %r14, %r13 + +// Montgomery reduce windows 0 and 1 together + + xorl %r15d, %r15d + movq $0x0000000100000000, %rdx + mulpadd(%r10,%r9,%r8) + mulpadd(%r11,%r10,%r9) + notq %rdx + leaq 2(%rdx), %rdx + mulpadd(%r12,%r11,%r8) + mulpadd(%r13,%r12,%r9) + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + +// Add row 2 + + movq 16(y), %rdx + xorl %r8d, %r8d + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + adoxq %r8, %r14 + mulxq 24(x), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + +// Add row 3 + + movq 24(y), %rdx + xorl %r9d, %r9d + mulpadd(%r12,%r11,(x)) + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)) + adoxq %r9, %r15 + mulxq 24(x), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + +// Montgomery reduce windows 2 and 3 together + + xorl %r9d, %r9d + movq $0x0000000100000000, %rdx + mulpadd(%r12,%r11,%r10) + mulpadd(%r13,%r12,%r11) + notq %rdx + leaq 2(%rdx), %rdx + mulpadd(%r14,%r13,%r10) + mulpadd(%r15,%r14,%r11) + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + +// We now have a pre-reduced 5-word form [%r8; %r15;%r14;%r13;%r12] +// Load [%rax;%r11;%rbx;%rdx;%rcx] = 2^320 - p_256, re-using earlier numbers a bit +// Do [%rax;%r11;%rbx;%rdx;%rcx] = [%r8;%r15;%r14;%r13;%r12] + (2^320 - p_256) + + movl $1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0x00000000fffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + +// Now carry is set if r + (2^320 - p_256) >= 2^320, i.e. r >= p_256 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. 
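The cmovc sequence that follows is a constant-time 4-word select keyed on that carry. An equivalent branch-free formulation in C (a sketch only; names are ours, and a compiler may or may not turn the mask form back into cmov):

    #include <stdint.h>

    /* Take the already-offset value when the carry fired (r >= p_256),
       otherwise keep r; carry is 0 or 1, and no data-dependent branch is used. */
    static void select_reduced(uint64_t z[4], const uint64_t r[4],
                               const uint64_t r_offset[4], unsigned carry) {
        uint64_t m = (uint64_t)0 - (uint64_t)(carry & 1);  /* all-ones iff carry */
        for (int i = 0; i < 4; i++)
            z[i] = r[i] ^ (m & (r[i] ^ r_offset[i]));      /* cmovc equivalent   */
    }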
+ + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %r9, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256_alt.S new file mode 100644 index 00000000000..9161da2cdb7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montmul_p256_alt.S @@ -0,0 +1,214 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256 (in particular this is true if we are in +// the "usual" case x < p_256 and y < p_256). +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// We move the y argument here so we can use %rdx for multipliers + +#define y %rcx + +// Add %rbx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// End version not catching the top carry-out + +#define mulpade(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high + +S2N_BN_SYMBOL(bignum_montmul_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Do row 0 computation, which is a bit different: +// set up initial window [%r12,%r11,%r10,%r9,%r8] = y[0] * x +// Unlike later, we only need a single carry chain + + movq (y), %rbx + movq (x), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + + movq 8(x), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + + movq 16(x), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + + movq 24(x), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + +// Add row 1 + + movq 8(y), %rbx + xorl %r13d, %r13d + mulpadi(%r14,%r10,%r9,(x)) + mulpadd(%r14,%r11,%r10,8(x)) + 
mulpadd(%r14,%r12,%r11,16(x)) + mulpade(%r14,%r13,%r12,24(x)) + +// Montgomery reduce windows 0 and 1 together + + xorl %r14d, %r14d + movq $0x0000000100000000, %rbx + mulpadi(%r15,%r10,%r9,%r8) + mulpadd(%r15,%r11,%r10,%r9) + notq %rbx + leaq 2(%rbx), %rbx + mulpadd(%r15,%r12,%r11,%r8) + mulpade(%r15,%r13,%r12,%r9) + adcq %r14, %r14 + +// Add row 2 + + movq 16(y), %rbx + xorl %r15d, %r15d + mulpadi(%r8,%r11,%r10,(x)) + mulpadd(%r8,%r12,%r11,8(x)) + mulpadd(%r8,%r13,%r12,16(x)) + mulpade(%r8,%r14,%r13,24(x)) + adcq %r15, %r15 + +// Add row 3 + + movq 24(y), %rbx + xorl %r8d, %r8d + mulpadi(%r9,%r12,%r11,(x)) + mulpadd(%r9,%r13,%r12,8(x)) + mulpadd(%r9,%r14,%r13,16(x)) + mulpade(%r9,%r15,%r14,24(x)) + adcq %r8, %r8 + +// Montgomery reduce windows 2 and 3 together + + xorl %r9d, %r9d + movq $0x0000000100000000, %rbx + mulpadi(%rcx,%r12,%r11,%r10) + mulpadd(%rcx,%r13,%r12,%r11) + notq %rbx + leaq 2(%rbx), %rbx + mulpadd(%rcx,%r14,%r13,%r10) + mulpade(%rcx,%r15,%r14,%r11) + adcq %r9, %r8 + +// We now have a pre-reduced 5-word form [%r8; %r15;%r14;%r13;%r12] +// Load [%rax;%r11;%r9;%rbx;%rcx] = 2^320 - p_256, re-using earlier numbers a bit +// Do [%rax;%r11;%r9;%rbx;%rcx] = [%r8;%r15;%r14;%r13;%r12] + (2^320 - p_256) + + movl $1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0x00000000fffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + +// Now carry is set if r + (2^320 - p_256) >= 2^320, i.e. r >= p_256 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rbx, %r13 + cmovcq %r9, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256.S new file mode 100644 index 00000000000..ca2c79c9997 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256.S @@ -0,0 +1,189 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256, assuming x^2 <= 2^256 * p_256, which is +// guaranteed in particular if x < p_256 initially (the "intended" case). 
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256) + .text + +#define z %rdi +#define x %rsi + +// Use this fairly consistently for a zero + +#define zero %rbp +#define zeroe %ebp + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +S2N_BN_SYMBOL(bignum_montsqr_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rdx + mulxq %rdx, %r8, %r15 + mulxq 8(x), %r9, %r10 + mulxq 24(x), %r11, %r12 + movq 16(x), %rdx + mulxq 24(x), %r13, %r14 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + movq 24(x), %rdx + mulpadd(%r13,%r12,8(x)) + adcxq zero, %r13 + adoxq zero, %r14 + adcq zero, %r14 + +// Double and add to the 00 + 11 + 22 + 33 terms + + xorl zeroe, zeroe + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 8(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 16(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 24(x), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq zero, %r15 + adoxq zero, %r15 + +// First two waves of Montgomery reduction. Consolidate the double carries +// in %r9 and propagate up to the top in %r8, which is no longer needed otherwise. + + xorl zeroe, zeroe + movq $0x0000000100000000, %rdx + mulpadd(%r10,%r9,%r8) + mulpadd(%r11,%r10,%r9) + movq $0xffffffff00000001, %rdx + mulpadd(%r12,%r11,%r8) + mulpadd(%r13,%r12,%r9) + adcxq zero, %r13 + movl zeroe, %r9d + adoxq zero, %r9 + adcxq zero, %r9 + addq %r9, %r14 + adcq zero, %r15 + movl zeroe, %r8d + adcq zero, %r8 + +// Now two more steps of Montgomery reduction, again with %r8 = top carry + + xorl zeroe, zeroe + movq $0x0000000100000000, %rdx + mulpadd(%r12,%r11,%r10) + mulpadd(%r13,%r12,%r11) + movq $0xffffffff00000001, %rdx + mulpadd(%r14,%r13,%r10) + mulpadd(%r15,%r14,%r11) + adcxq zero, %r15 + adoxq zero, %r8 + adcq zero, %r8 + +// Load [%rax;%r11;%rbp;%rdx;%rcx] = 2^320 - p_256, re-using earlier numbers a bit +// Do [%rax;%r11;%rbp;%rdx;%rcx] = [%r8;%r15;%r14;%r13;%r12] + (2^320 - p_256) + + movl $1, %ecx + addq %r12, %rcx + leaq -1(%rdx), %rdx + adcq %r13, %rdx + leaq -1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0x00000000fffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + +// Now carry is set if r + (2^320 - p_256) >= 2^320, i.e. r >= p_256 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. 
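The squaring above computes each heterogeneous product x_i*x_j (i < j) exactly once, doubles the whole window, and only then adds the squares x_i^2, which is what makes it cheaper than a general multiply. The same structure is shown here with 32-bit limbs so the arithmetic fits in plain uint64_t; this is purely an illustration, not the 4-limb code.

    #include <stdint.h>

    /* (h*2^32 + l)^2 = h^2*2^64 + 2*(l*h)*2^32 + l^2: form the cross product
       once, double it, then add the two squares. */
    static void sqr_2x32(uint32_t l, uint32_t h, uint64_t out[2]) {
        uint64_t ll = (uint64_t)l * l;
        uint64_t hh = (uint64_t)h * h;
        uint64_t cross = (uint64_t)l * h;            /* heterogeneous term     */
        uint64_t cross2_lo = cross << 1;             /* doubled, 65 bits total */
        uint64_t cross2_hi = cross >> 63;
        uint64_t add_lo = cross2_lo << 32;           /* place at bit offset 32 */
        uint64_t add_hi = (cross2_lo >> 32) | (cross2_hi << 32);
        uint64_t lo = ll + add_lo;
        uint64_t hi = hh + add_hi + (lo < ll);       /* carry from the low word */
        out[0] = lo;
        out[1] = hi;
    }

    int main(void) {
        uint64_t out[2];
        sqr_2x32(0xdeadbeefu, 0x12345678u, out);
        unsigned __int128 x = ((unsigned __int128)0x12345678u << 32) | 0xdeadbeefu;
        unsigned __int128 s = x * x;
        return !(out[0] == (uint64_t)s && out[1] == (uint64_t)(s >> 64));
    }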
+ + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %rbp, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256_alt.S new file mode 100644 index 00000000000..688560cc56f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_montsqr_p256_alt.S @@ -0,0 +1,212 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256, assuming x^2 <= 2^256 * p_256, which is +// guaranteed in particular if x < p_256 initially (the "intended" case). +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// Add %rbx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// End version not catching the top carry-out + +#define mulpade(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high + +S2N_BN_SYMBOL(bignum_montsqr_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 8(x), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 24(x), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 16(x), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadi(%rcx,%r11,%r10,(x)) + mulpadd(%rcx,%r12,%r11,8(x)) + movq 24(x), %rbx + mulpade(%rcx,%r13,%r12,8(x)) + adcq $0, %r14 + +// Double the window [%r14;...;%r9], catching top carry in %rcx + + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, 
%r13 + adcq %r14, %r14 + adcq %rcx, %rcx + +// Add to the 00 + 11 + 22 + 33 terms + + movq 8(x), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 16(x), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 24(x), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + +// First two waves of Montgomery reduction, now re-using %r8 for top carry + + movq $0x0000000100000000, %rbx + mulpadi(%rcx,%r10,%r9,%r8) + mulpadd(%rcx,%r11,%r10,%r9) + notq %rbx + leaq 2(%rbx), %rbx + mulpadd(%rcx,%r12,%r11,%r8) + xorl %r8d, %r8d + mulpade(%rcx,%r13,%r12,%r9) + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + +// Now two more steps of Montgomery reduction, again with %r8 = top carry + + movq $0x0000000100000000, %rbx + mulpadi(%rcx,%r12,%r11,%r10) + mulpadd(%rcx,%r13,%r12,%r11) + notq %rbx + leaq 2(%rbx), %rbx + mulpadd(%rcx,%r14,%r13,%r10) + xorl %r9d, %r9d + mulpade(%rcx,%r15,%r14,%r11) + adcq %r9, %r8 + +// Load [%rax;%r11;%r9;%rbx;%rcx] = 2^320 - p_256, re-using earlier numbers a bit +// Do [%rax;%r11;%r9;%rbx;%rcx] = [%r8;%r15;%r14;%r13;%r12] + (2^320 - p_256) + + movl $1, %ecx + addq %r12, %rcx + leaq -1(%rbx), %rbx + adcq %r13, %rbx + leaq -1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0x00000000fffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + +// Now carry is set if r + (2^320 - p_256) >= 2^320, i.e. r >= p_256 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rbx, %r13 + cmovcq %r9, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mux_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mux_4.S new file mode 100644 index 00000000000..709b6d4998f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_mux_4.S @@ -0,0 +1,74 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// 256-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) +// Inputs p, x[4], y[4]; output z[4] +// +// extern void bignum_mux_4 +// (uint64_t p, uint64_t z[static 4], +// uint64_t x[static 4], uint64_t y[static 4]); +// +// It is assumed that all numbers x, y and z have the same size 4 digits. 
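A branch-free C model of the selection this routine performs (the assembly uses cmovz, which is equally data-independent; the reference name below is ours):

    #include <stdint.h>

    /* z := x if p is nonzero, else z := y; all operands are 4 limbs. */
    static void bignum_mux_4_ref(uint64_t p, uint64_t z[4],
                                 const uint64_t x[4], const uint64_t y[4]) {
        uint64_t m = (uint64_t)0 - (uint64_t)(p != 0);   /* all-ones iff p != 0 */
        for (int i = 0; i < 4; i++)
            z[i] = y[i] ^ (m & (x[i] ^ y[i]));
    }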
+// +// Standard x86-64 ABI: RDI = p, RSI = z, RDX = x, RCX = y +// Microsoft x64 ABI: RCX = p, RDX = z, R8 = x, R9 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mux_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mux_4) + .text + +#define p %rdi +#define z %rsi +#define x %rdx +#define y %rcx +#define a %rax +#define b %r8 + + +S2N_BN_SYMBOL(bignum_mux_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + testq p, p + + movq (x), a + movq (y), b + cmovzq b, a + movq a, (z) + + movq 8(x), a + movq 8(y), b + cmovzq b, a + movq a, 8(z) + + movq 16(x), a + movq 16(y), b + cmovzq b, a + movq a, 16(z) + + movq 24(x), a + movq 24(y), b + cmovzq b, a + movq a, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_neg_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_neg_p256.S new file mode 100644 index 00000000000..d5ddab107b9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_neg_p256.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_256, z := (-x) mod p_256, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_p256 (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_p256) + .text + +#define z %rdi +#define x %rsi + +#define q %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 + +#define d0short %eax +#define n1short %r10d + +S2N_BN_SYMBOL(bignum_neg_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input digits as [d3;d2;d1;d0] and also set a bitmask q +// for the input being nonzero, so that we avoid doing -0 = p_256 +// and hence maintain strict modular reduction + + movq (x), d0 + movq 8(x), d1 + movq d0, n1 + orq d1, n1 + movq 16(x), d2 + movq 24(x), d3 + movq d2, n3 + orq d3, n3 + orq n1, n3 + negq n3 + sbbq q, q + +// Load the non-trivial words of p_256 = [n3;0;n1;-1] and mask them with q + + movl $0x00000000ffffffff, n1short + movq $0xffffffff00000001, n3 + andq q, n1 + andq q, n3 + +// Do the subtraction, getting it as [n3;d0;n1;q] to avoid moves + + subq d0, q + movl $0, d0short + sbbq d1, n1 + sbbq d2, d0 + sbbq d3, n3 + +// Write back + + movq q, (z) + movq n1, 8(z) + movq d0, 16(z) + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_nonzero_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_nonzero_4.S new file mode 100644 index 00000000000..0daab9a2177 --- /dev/null +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_nonzero_4.S @@ -0,0 +1,58 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// 256-bit nonzeroness test, returning 1 if x is nonzero, 0 if x is zero +// Input x[4]; output function return +// +// extern uint64_t bignum_nonzero_4(uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = x, returns RAX +// Microsoft x64 ABI: RCX = x, returns RAX +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_nonzero_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_nonzero_4) + .text + +#define x %rdi +#define a %rax +#define d %rdx +#define dshort %edx + + + +S2N_BN_SYMBOL(bignum_nonzero_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi +#endif + +// Generate a = an OR of all the words in the bignum + + movq (x), a + movq 8(x), d + orq 16(x), a + orq 24(x), d + orq d, a + +// Set a standard C condition based on whether a is nonzero + + movl $1, dshort + cmovnzq d, a + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_optneg_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_optneg_p256.S new file mode 100644 index 00000000000..91a1b95f697 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_optneg_p256.S @@ -0,0 +1,102 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_256, z := (-x) mod p_256 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p256 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = p, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = p, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p256) + .text + +#define z %rdi +#define q %rsi +#define x %rdx + +#define n0 %rax +#define n1 %rcx +#define n2 %r8 +#define n3 %r9 + +#define n1short %ecx + +S2N_BN_SYMBOL(bignum_optneg_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Adjust q by zeroing it if the input is zero (to avoid giving -0 = p_256, +// which is not strictly reduced even though it's correct modulo p_256). +// This step is redundant if we know a priori that the input is nonzero, which +// is the case for the y coordinate of points on the P-256 curve, for example. 
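Putting that together, a C sketch of the routine's overall semantics: negate x modulo p_256 only when the caller asked for it and x is nonzero, otherwise pass x through, all without a data-dependent branch. The reference name and least-significant-first limb order are ours.

    #include <stdint.h>

    static void optneg_p256_ref(uint64_t z[4], uint64_t p, const uint64_t x[4]) {
        static const uint64_t P256[4] = {
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL
        };
        uint64_t nonzero = x[0] | x[1] | x[2] | x[3];
        /* negate only if requested AND x != 0, so we never emit p_256 itself */
        uint64_t m = (uint64_t)0 - (uint64_t)((p != 0) & (nonzero != 0));
        uint64_t t[4], borrow = 0;
        for (int i = 0; i < 4; i++) {        /* t = p_256 - x (x is reduced)  */
            unsigned __int128 d =
                (unsigned __int128)P256[i] - x[i] - borrow;
            t[i] = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1;
        }
        for (int i = 0; i < 4; i++)          /* select t or x under the mask  */
            z[i] = x[i] ^ (m & (x[i] ^ t[i]));
    }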
+ + movq (x), n0 + orq 8(x), n0 + movq 16(x), n1 + orq 24(x), n1 + orq n1, n0 + negq n0 + sbbq n0, n0 + andq n0, q + +// Turn q into a bitmask, all 1s for q=false, all 0s for q=true + + negq q + sbbq q, q + notq q + +// Let [n3;n2;n1;n0] = if q then p_256 else -1 + + movq $0xffffffffffffffff, n0 + movl $0x00000000ffffffff, n1short + orq q, n1 + movq q, n2 + movq $0xffffffff00000001, n3 + orq q, n3 + +// Subtract so [n3;n2;n1;n0] = if q then p_256 - x else -1 - x + + subq (x), n0 + sbbq 8(x), n1 + sbbq 16(x), n2 + sbbq 24(x), n3 + +// XOR the words with the bitmask, which in the case q = false has the +// effect of restoring ~(-1 - x) = -(-1 - x) - 1 = 1 + x - 1 = x +// and write back the digits to the output + + xorq q, n0 + movq n0, (z) + xorq q, n1 + movq n1, 8(z) + xorq q, n2 + movq n2, 16(z) + xorq q, n3 + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_sub_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_sub_p256.S new file mode 100644 index 00000000000..3cccec875ed --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_sub_p256.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_256, z := (x - y) mod p_256 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p256 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p256) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %rdx +#define c %r11 + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_sub_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and subtract the two inputs as [d3;d2;d1;d0] = x - y (modulo 2^256) + + movq (x), d0 + subq (y), d0 + movq 8(x), d1 + sbbq 8(y), d1 + movq 16(x), d2 + sbbq 16(y), d2 + movq 24(x), d3 + sbbq 24(y), d3 + +// Capture the carry, which indicates x < y, and create corresponding masked +// correction p_256' = [n3; 0; n1; c] to add + + movl $0x00000000ffffffff, n1short + sbbq c, c + xorq n3, n3 + andq c, n1 + subq n1, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256.S new file mode 100644 index 00000000000..f748a82d27a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256.S @@ -0,0 +1,191 @@ +// Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256) + .text + +#define z %rdi +#define x %rsi + +// Some temp registers for the last correction stage + +#define d %rax +#define u %rdx +#define v %rcx + +#define dshort %eax +#define ushort %edx + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +S2N_BN_SYMBOL(bignum_tomont_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// We are essentially just doing a Montgomery multiplication of x and the +// precomputed constant y = 2^512 mod p, so the code is almost the same +// modulo a few registers and the change from loading y[i] to using constants. +// Because there is no y pointer to keep, we use one register less. + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Do row 0 computation, which is a bit different: +// set up initial window [%r12,%r11,%r10,%r9,%r8] = y[0] * x +// Unlike later, we only need a single carry chain + + xorq %r13, %r13 + movl $0x0000000000000003, %edx + mulxq (x), %r8, %r9 + mulxq 8(x), %rcx, %r10 + adcxq %rcx, %r9 + mulxq 16(x), %rcx, %r11 + adcxq %rcx, %r10 + mulxq 24(x), %rcx, %r12 + adcxq %rcx, %r11 + adcxq %r13, %r12 + +// Add row 1 + + movq $0xfffffffbffffffff, %rdx + xorq %r14, %r14 + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpadd(%r13,%r12,24(x)) + adcq %r14, %r13 + +// Montgomery reduce windows 0 and 1 together + + xorq %r15, %r15 + movq $0x0000000100000000, %rdx + mulpadd(%r10,%r9,%r8) + mulpadd(%r11,%r10,%r9) + movq $0xffffffff00000001, %rdx + mulpadd(%r12,%r11,%r8) + mulpadd(%r13,%r12,%r9) + adcxq %r15, %r13 + adoxq %r15, %r14 + adcxq %r15, %r14 + +// Add row 2 + + movq $0xfffffffffffffffe, %rdx + xorq %r8, %r8 + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + mulpadd(%r14,%r13,24(x)) + adcxq %r8, %r14 + adoxq %r8, %r15 + adcxq %r8, %r15 + +// Add row 3 + + movq $0x00000004fffffffd, %rdx + xorq %r9, %r9 + mulpadd(%r12,%r11,(x)) + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)) + mulpadd(%r15,%r14,24(x)) + adcxq %r9, %r15 + adoxq %r9, %r8 + adcxq %r9, %r8 + +// Montgomery reduce windows 2 and 3 together + + xorq %r9, %r9 + movq $0x0000000100000000, %rdx + mulpadd(%r12,%r11,%r10) + mulpadd(%r13,%r12,%r11) + movq $0xffffffff00000001, %rdx + mulpadd(%r14,%r13,%r10) + mulpadd(%r15,%r14,%r11) + adcxq %r9, %r15 + adoxq %r9, %r8 + adcxq %r9, %r8 + +// We now have a pre-reduced 5-word form [%r8; %r15;%r14;%r13;%r12] +// Load non-trivial digits of p_256 = [v; 0; u; -1] + + movl $0x00000000ffffffff, ushort + movq $0xffffffff00000001, v + +// Now do the subtraction (0,p_256-1) - (%r8,%r15,%r14,%r13,%r12) to get the carry + + movq $-2, d + subq %r12, d + movq u, d + sbbq %r13, d + movl $0, dshort 
+ sbbq %r14, d + movq v, d + sbbq %r15, d + +// This last last comparison in the chain will actually even set the mask +// for us, so we don't need to separately create it from the carry. +// This means p_256 - 1 < (c,d1,d0,d5,d4), i.e. we are so far >= p_256 + + movl $0, dshort + sbbq %r8, d + andq d, u + andq d, v + +// Do a masked subtraction of p_256 and write back + + subq d, %r12 + sbbq u, %r13 + sbbq $0, %r14 + sbbq v, %r15 + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256_alt.S new file mode 100644 index 00000000000..15a10edc19c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_tomont_p256_alt.S @@ -0,0 +1,199 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// Add %rcx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// End version not catching the top carry-out + +#define mulpade(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rcx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high + +S2N_BN_SYMBOL(bignum_tomont_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Do row 0 computation, which is a bit different: +// set up initial window [%r12,%r11,%r10,%r9,%r8] = y[0] * x +// Unlike later, we only need a single carry chain + + movl $0x0000000000000003, %ecx + movq (x), %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + + movq 8(x), %rax + mulq %rcx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + + movq 16(x), %rax + mulq %rcx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + + movq 24(x), %rax + mulq %rcx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + +// Add row 1 + + movq $0xfffffffbffffffff, %rcx + xorl %r13d, %r13d + mulpadi(%r14,%r10,%r9,(x)) + mulpadd(%r14,%r11,%r10,8(x)) + mulpadd(%r14,%r12,%r11,16(x)) + mulpade(%r14,%r13,%r12,24(x)) + +// Montgomery reduce windows 0 and 1 together + + 
xorl %r14d, %r14d + movq $0x0000000100000000, %rcx + mulpadi(%r15,%r10,%r9,%r8) + mulpadd(%r15,%r11,%r10,%r9) + notq %rcx + leaq 2(%rcx), %rcx + mulpadd(%r15,%r12,%r11,%r8) + mulpade(%r15,%r13,%r12,%r9) + adcq %r14, %r14 + +// Add row 2 + + movq $0xfffffffffffffffe, %rcx + xorl %r15d, %r15d + mulpadi(%r8,%r11,%r10,(x)) + mulpadd(%r8,%r12,%r11,8(x)) + mulpadd(%r8,%r13,%r12,16(x)) + mulpade(%r8,%r14,%r13,24(x)) + adcq %r15, %r15 + +// Add row 3 + + movq $0x00000004fffffffd, %rcx + xorl %r8d, %r8d + mulpadi(%r9,%r12,%r11,(x)) + mulpadd(%r9,%r13,%r12,8(x)) + mulpadd(%r9,%r14,%r13,16(x)) + mulpade(%r9,%r15,%r14,24(x)) + adcq %r8, %r8 + +// Montgomery reduce windows 2 and 3 together + + movq $0x0000000100000000, %rcx + mulpadi(%r9,%r12,%r11,%r10) + mulpadd(%r9,%r13,%r12,%r11) + notq %rcx + leaq 2(%rcx), %rcx + mulpadd(%r9,%r14,%r13,%r10) + mulpadd(%r9,%r15,%r14,%r11) + subq %r9, %r8 + +// We now have a pre-reduced 5-word form [%r8; %r15;%r14;%r13;%r12] +// Load [%rax;%r11;%r9;%rcx;%rdx] = 2^320 - p_256, re-using earlier numbers a bit +// Do [%rax;%r11;%r9;%rcx;%rdx] = [%r8;%r15;%r14;%r13;%r12] + (2^320 - p_256) + + xorl %edx, %edx + leaq -1(%rdx), %r9 + incq %rdx + addq %r12, %rdx + decq %rcx + adcq %r13, %rcx + movq %r9, %rax + adcq %r14, %r9 + movl $0x00000000fffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + +// Now carry is set if r + (2^320 - p_256) >= 2^320, i.e. r >= p_256 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rdx, %r12 + cmovcq %rcx, %r13 + cmovcq %r9, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256.S new file mode 100644 index 00000000000..0893b1004f0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256.S @@ -0,0 +1,132 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256, z := (3 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_256, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_256. 
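Editor's note: the header above promises a fully reduced result for any 4-digit input; the mechanism, explained in the comments further down in this file, is the quotient estimate q = h + 1 where h is the top (fifth) word of 3 * x. The following portable C sketch restates that reduction. It is illustrative only: the names and limb layout are assumptions, and the assembly below avoids the generic q * p_256 product by exploiting the special shape of p_256.

#include <stdint.h>

typedef unsigned __int128 u128;

// p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit limbs
static const uint64_t P256[4] = {
    0xffffffffffffffffULL, 0x00000000ffffffffULL,
    0x0000000000000000ULL, 0xffffffff00000001ULL};

// Sketch of z := (3 * x) mod p_256 using the q = h + 1 quotient estimate.
static void triple_p256_sketch(uint64_t z[4], const uint64_t x[4]) {
  uint64_t t[5], qp[5];
  u128 carry = 0;

  // t = 3 * x kept as five limbs, so h = t[4] = floor(3*x / 2^256)
  for (int i = 0; i < 4; i++) {
    carry += (u128)x[i] * 3;
    t[i] = (uint64_t)carry;
    carry >>= 64;
  }
  t[4] = (uint64_t)carry;

  // Quotient estimate q = h + 1 and the five-limb product q * p_256
  uint64_t q = t[4] + 1;
  carry = 0;
  for (int i = 0; i < 4; i++) {
    carry += (u128)q * P256[i];
    qp[i] = (uint64_t)carry;
    carry >>= 64;
  }
  qp[4] = (uint64_t)carry;

  // r = t - q * p_256; the estimate guarantees -p_256 <= r < p_256
  uint64_t r[4], borrow = 0;
  for (int i = 0; i < 5; i++) {
    u128 d = (u128)t[i] - qp[i] - borrow;
    if (i < 4) r[i] = (uint64_t)d;
    borrow = (uint64_t)(d >> 127);  // 1 exactly when the subtraction wrapped
  }

  // If r went negative (final borrow set), add back a single p_256
  uint64_t addback = (uint64_t)0 - borrow;
  carry = 0;
  for (int i = 0; i < 4; i++) {
    carry += (u128)r[i] + (P256[i] & addback);
    z[i] = (uint64_t)carry;
    carry >>= 64;
  }
}

Since x < 2^256, h <= 2 and so q <= 3, which is what makes a single conditional add-back of p_256 sufficient to finish the reduction.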
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define q %rdx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx + +#define ashort %eax +#define cshort %ecx +#define qshort %edx + +S2N_BN_SYMBOL(bignum_triple_p256): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + xorl ashort, ashort + + movq (x), q + movq q, d0 + adcxq q, q + adoxq q, d0 + movq 8(x), q + movq q, d1 + adcxq q, q + adoxq q, d1 + movq 16(x), q + movq q, d2 + adcxq q, q + adoxq q, d2 + movq 24(x), q + movq q, d3 + adcxq q, q + adoxq q, d3 + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256 <= z - q * p_256 < p_256, so we just need +// to subtract q * p_256 and then if that's negative, add back p_256. + + movl $1, qshort + adcxq a, q + adoxq a, q + +// Now compute the initial pre-reduced result z - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q + + addq q, d0 + movq $0x0000000100000000, a + mulxq a, a, c + sbbq $0, a + sbbq $0, c + subq a, d1 + sbbq c, d2 + movq $0xffffffff00000001, a + mulxq a, a, c + sbbq a, d3 + sbbq c, q + +// q is now effectively the top word of the 5-digits result; this step +// compensates for q = h + 1 + + decq q + +// Use that as a bitmask for a masked addition of p_256 and write back + + movl $0x00000000ffffffff, ashort + andq q, a + xorl cshort, cshort + subq a, c + addq q, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256_alt.S new file mode 100644 index 00000000000..01221e75f33 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/bignum_triple_p256_alt.S @@ -0,0 +1,137 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256, z := (3 * x) mod p_256 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_256, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_256. 
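Editor's note: like the other *_alt files in this import, the variant below performs the same computation as bignum_triple_p256 above but with the baseline widening MULQ instruction and explicit carry handling, rather than the MULX/ADCX/ADOX dual-carry chains, which require BMI2 and ADX support. Both styles are built from the same primitive, a 64x64 -> 128-bit multiply, which in portable C is simply the following (the helper name is illustrative):

#include <stdint.h>

// One 64x64 -> 128-bit product split into high and low words, the primitive
// behind both the MULX-based and the classic MULQ-based code paths.
static inline void mul64_wide(uint64_t a, uint64_t b,
                              uint64_t *hi, uint64_t *lo) {
  unsigned __int128 p = (unsigned __int128)a * b;
  *lo = (uint64_t)p;
  *hi = (uint64_t)(p >> 64);
}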
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256_alt) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define q %rcx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx +#define d %rdx + +#define ashort %eax +#define cshort %ecx +#define qshort %ecx +#define dshort %edx + +S2N_BN_SYMBOL(bignum_triple_p256_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + movl $3, cshort + + movq (x), a + mulq c + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq c + addq a, d1 + adcq d, d2 + + movq 16(x), a + xorq d3, d3 + mulq c + addq a, d2 + adcq d, d3 + + movq 24(x), a + mulq c + addq a, d3 + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256 <= z - q * p_256 < p_256, so we just need +// to subtract q * p_256 and then if that's negative, add back p_256. + + movl $1, qshort + adcq d, q + +// Now compute the initial pre-reduced result z - p_256 * q +// = z - (2^256 - 2^224 + 2^192 + 2^96 - 1) * q +// = z - 2^192 * 0xffffffff00000001 * q - 2^64 * 0x0000000100000000 * q + q +// Since q is small just use q<<32 for 0x0000000100000000 * q. + + movq $0xffffffff00000001, a + mulq q + movq q, x + shlq $32, x + addq q, d0 + sbbq $0, x + subq x, d1 + sbbq $0, d2 + sbbq a, d3 + sbbq d, q + +// q is now effectively the top word of the 5-digit result; this step +// compensates for q = h + 1 + + decq q + +// Use that as a bitmask for a masked addition of p_256 and write back + + movl $0x00000000ffffffff, ashort + andq q, a + xorl dshort, dshort + subq a, d + addq q, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq d, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd.S new file mode 100644 index 00000000000..2363e448b8c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd.S @@ -0,0 +1,589 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
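Editor's note: the macros in the file below are documented as corresponding exactly to bignum_montmul_p256, bignum_montsqr_p256 and bignum_sub_p256, so the overall field-operation schedule (commented further down as 12 multiplies, 4 squares, 7 subtracts) can be restated as plain C calls to those functions. This is an illustrative sketch, not AWS-LC code: the prototypes are assumed to match the imported s2n-bignum header, every squaring is written as a full Montgomery squaring (the assembly substitutes cheaper almost-Montgomery squarings where its comments note that is safe), and the zero-point handling uses branches where the assembly uses one comparison plus conditional moves.

#include <stdint.h>
#include <string.h>

// Field operations modulo p_256 on 4-limb Montgomery-domain values;
// prototypes assumed to match the s2n-bignum header imported in this change.
extern void bignum_montmul_p256(uint64_t z[static 4], uint64_t x[static 4],
                                uint64_t y[static 4]);
extern void bignum_montsqr_p256(uint64_t z[static 4], uint64_t x[static 4]);
extern void bignum_sub_p256(uint64_t z[static 4], uint64_t x[static 4],
                            uint64_t y[static 4]);

// Illustrative restatement of the p256_montjadd schedule below.
void p256_montjadd_sketch(uint64_t p3[static 12], uint64_t p1[static 12],
                          uint64_t p2[static 12]) {
  uint64_t *x1 = p1, *y1 = p1 + 4, *z1 = p1 + 8;
  uint64_t *x2 = p2, *y2 = p2 + 4, *z2 = p2 + 8;
  uint64_t z1sq[4], z2sq[4], y1a[4], y2a[4], x1a[4], x2a[4];
  uint64_t xd[4], yd[4], zz[4], ww[4], zzx1[4], zzx2[4];
  uint64_t resx[4], resy[4], resz[4], t1[4], t2[4];

  bignum_montsqr_p256(z1sq, z1);            // z1^2
  bignum_montsqr_p256(z2sq, z2);            // z2^2
  bignum_montmul_p256(y1a, z2, y1);
  bignum_montmul_p256(y2a, z1, y2);
  bignum_montmul_p256(x2a, z1sq, x2);       // x2 * z1^2
  bignum_montmul_p256(x1a, z2sq, x1);       // x1 * z2^2
  bignum_montmul_p256(y2a, z1sq, y2a);      // y2 * z1^3
  bignum_montmul_p256(y1a, z2sq, y1a);      // y1 * z2^3
  bignum_sub_p256(xd, x2a, x1a);
  bignum_sub_p256(yd, y2a, y1a);
  bignum_montsqr_p256(zz, xd);
  bignum_montsqr_p256(ww, yd);
  bignum_montmul_p256(zzx1, zz, x1a);
  bignum_montmul_p256(zzx2, zz, x2a);
  bignum_sub_p256(resx, ww, zzx1);
  bignum_sub_p256(t1, zzx2, zzx1);
  bignum_montmul_p256(xd, xd, z1);
  bignum_sub_p256(resx, resx, zzx2);
  bignum_sub_p256(t2, zzx1, resx);
  bignum_montmul_p256(t1, t1, y1a);
  bignum_montmul_p256(resz, xd, z2);
  bignum_montmul_p256(t2, yd, t2);
  bignum_sub_p256(resy, t2, t1);

  // A zero z coordinate marks the point at infinity: if exactly one input is
  // zero, return the other input; otherwise return the computed sum.
  int p1zero = (z1[0] | z1[1] | z1[2] | z1[3]) == 0;
  int p2zero = (z2[0] | z2[1] | z2[2] | z2[3]) == 0;
  if (p1zero && !p2zero) {
    memcpy(p3, p2, 12 * sizeof(uint64_t));
  } else if (p2zero && !p1zero) {
    memcpy(p3, p1, 12 * sizeof(uint64_t));
  } else {
    memcpy(p3, resx, sizeof(resx));
    memcpy(p3 + 4, resy, sizeof(resy));
    memcpy(p3 + 8, resz, sizeof(resz));
  }
}

The safety of feeding almost-Montgomery squarings into later multiplications is argued in the comment on amontsqr_p256 in the assembly below.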
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// The first two hold initially, and the second is +// set up by copying the initial %rdx input to %rbp. +// Thereafter, no code macro modifies any of them. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p256 + +#define montmul_p256(P0,P1,P2) \ + xorl %r13d, %r13d ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + adcq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + adcq %r13, %r12 ; \ + movq 0x8+P2, %rdx ; \ + xorl %r14d, %r14d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcq %r14, %r13 ; \ + xorl %r15d, %r15d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %r15, %r13 ; \ + adoxq %r15, %r14 ; \ + adcq %r15, %r14 ; \ + movq 0x10+P2, %rdx ; \ + xorl %r8d, %r8d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adoxq %r8, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, %r13 ; \ + adcq %rbx, %r14 ; \ + adcq %r8, %r15 ; \ + movq 0x18+P2, %rdx ; \ + xorl %r9d, %r9d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + adoxq %r9, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, 
%r14 ; \ + adcq %rbx, %r15 ; \ + adcq %r9, %r8 ; \ + xorl %r9d, %r9d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %r9, %r15 ; \ + adoxq %r9, %r8 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rdx; \ + adcq %r13, %rdx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256 except for +// register tweaks to avoid modifying %rbp. + +#define montsqr_p256(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + movl %ecx, %r9d ; \ + adoxq %rcx, %r9 ; \ + adcxq %rcx, %r9 ; \ + addq %r9, %r14 ; \ + adcq %rcx, %r15 ; \ + movl %ecx, %r8d ; \ + adcq %rcx, %r8 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r8 ; \ + adcq %rcx, %r8 ; \ + movl $0x1, %ebx ; \ + addq %r12, %rbx ; \ + leaq -0x1(%rdx), %rdx ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rcx), %rcx ; \ + movq %rcx, %rax ; \ + adcq %r14, %rcx ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rbx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rcx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Almost-Montgomery variant which we use 
when an input to other muls +// with the other argument fully reduced (which is always safe). +// Again, the basic squaring code is tweaked to avoid modifying %rbp. + +#define amontsqr_p256(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + movl %ecx, %r9d ; \ + adoxq %rcx, %r9 ; \ + adcxq %rcx, %r9 ; \ + addq %r9, %r14 ; \ + adcq %rcx, %r15 ; \ + movl %ecx, %r8d ; \ + adcq %rcx, %r8 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r8 ; \ + adcq %rcx, %r8 ; \ + movl $0x1, %r8d ; \ + leaq -0x1(%rdx), %rdx ; \ + leaq -0x1(%rcx), %rax ; \ + movl $0xfffffffe, %r11d ; \ + cmovzq %rcx, %r8 ; \ + cmovzq %rcx, %rdx ; \ + cmovzq %rcx, %rax ; \ + cmovzq %rcx, %r11 ; \ + addq %r8, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %rax, %r14 ; \ + adcq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 
+ +#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(p256_montjadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts as long as it's needed. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_p256(z1sq,z_1) + amontsqr_p256(z2sq,z_2) + + montmul_p256(y1a,z_2,y_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,z1sq,x_2) + montmul_p256(x1a,z2sq,x_1) + montmul_p256(y2a,z1sq,y2a) + montmul_p256(y1a,z2sq,y1a) + + sub_p256(xd,x2a,x1a) + sub_p256(yd,y2a,y1a) + + amontsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x1a) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(xd,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y1a) + + montmul_p256(resz,xd,z_2) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd_alt.S new file mode 100644 index 00000000000..7dfa8b10aa3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjadd_alt.S @@ -0,0 +1,574 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// The first two hold initially, and the second is +// set up by copying the initial %rdx input to %rbp. +// Thereafter, no code macro modifies any of them. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_montmul_p256_alt + +#define montmul_p256(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x8+P2, %rbx ; \ + xorl %r13d, %r13d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r14, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + xorl %r14d, %r14d ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r15, %r15 ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + 
movq %r8, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r15, %r15 ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P2, %rbx ; \ + xorl %r15d, %r15d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P2, %rbx ; \ + xorl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + xorl %r9d, %r9d ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rbx; \ + adcq %r13, %rbx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 ; \ + movq 0x18+P1, 
%rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + xorl %r8d, %r8d ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r8, %r14 ; \ + adcq %r8, %r15 ; \ + adcq %r8, %r8 ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + xorl %r9d, %r9d ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + leaq -0x1(%rbx), %rbx ; \ + adcq %r13, %rbx ; \ + leaq -0x1(%r9), %r9 ; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 + +#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(p256_montjadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts as long as it's needed. 
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + montsqr_p256(z1sq,z_1) + montsqr_p256(z2sq,z_2) + + montmul_p256(y1a,z_2,y_1) + montmul_p256(y2a,z_1,y_2) + + montmul_p256(x2a,z1sq,x_2) + montmul_p256(x1a,z2sq,x_1) + montmul_p256(y2a,z1sq,y2a) + montmul_p256(y1a,z2sq,y1a) + + sub_p256(xd,x2a,x1a) + sub_p256(yd,y2a,y1a) + + montsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x1a) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(xd,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y1a) + + montmul_p256(resz,xd,z_2) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble.S new file mode 100644 index 00000000000..ef0904c25dc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble.S @@ -0,0 +1,630 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
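Editor's note: as with the addition routines earlier in this import, the doubling below is a fixed schedule of the per-field-element primitives its macros are documented as corresponding to (bignum_montsqr_p256, bignum_montmul_p256, bignum_add_p256, bignum_sub_p256), plus a few fused small-scalar combinations. The C sketch below restates that schedule under assumptions: the prototypes are taken to match the imported s2n-bignum header, a simple double-and-add helper stands in for the fused cmsub macros, and fully reduced additions replace the "weak" addition the assembly uses for x + z^2.

#include <stdint.h>
#include <string.h>

// Modular arithmetic on 4-limb values; prototypes assumed to match the
// s2n-bignum header imported in this change.
extern void bignum_montmul_p256(uint64_t z[static 4], uint64_t x[static 4],
                                uint64_t y[static 4]);
extern void bignum_montsqr_p256(uint64_t z[static 4], uint64_t x[static 4]);
extern void bignum_add_p256(uint64_t z[static 4], uint64_t x[static 4],
                            uint64_t y[static 4]);
extern void bignum_sub_p256(uint64_t z[static 4], uint64_t x[static 4],
                            uint64_t y[static 4]);

// (c * x) mod p_256 for a small constant c, by double-and-add on reduced
// values.  Purely illustrative: the assembly fuses these scalings into its
// cmsub_p256 / cmsub41_p256 / cmsub38_p256 macros instead.
static void scale_small_p256(uint64_t z[4], uint64_t c, uint64_t x[4]) {
  uint64_t acc[4] = {0, 0, 0, 0};
  for (int bit = 63; bit >= 0; bit--) {
    bignum_add_p256(acc, acc, acc);        // acc := 2 * acc mod p_256
    if ((c >> bit) & 1)
      bignum_add_p256(acc, acc, x);        // acc := acc + x mod p_256
  }
  memcpy(z, acc, sizeof(acc));
}

// Illustrative restatement of the p256_montjdouble schedule below.
void p256_montjdouble_sketch(uint64_t p3[static 12], uint64_t p1[static 12]) {
  uint64_t *x = p1, *y = p1 + 4, *z = p1 + 8;
  uint64_t *x3 = p3, *y3 = p3 + 4, *z3 = p3 + 8;
  uint64_t z2[4], y2[4], x2p[4], xy2[4], x4p[4], y4[4];
  uint64_t t1[4], t2[4], d[4], dx2[4], u[4], v[4];

  bignum_montsqr_p256(z2, z);              // z^2
  bignum_montsqr_p256(y2, y);              // y^2
  bignum_sub_p256(t2, x, z2);              // x - z^2
  bignum_add_p256(t1, x, z2);              // x + z^2
  bignum_montmul_p256(x2p, t1, t2);        // x^2 - z^4
  bignum_add_p256(t1, y, z);               // y + z
  bignum_montmul_p256(xy2, x, y2);         // x * y^2
  bignum_montsqr_p256(x4p, x2p);           // (x^2 - z^4)^2
  bignum_montsqr_p256(t1, t1);             // (y + z)^2
  scale_small_p256(u, 12, xy2);
  scale_small_p256(v, 9, x4p);
  bignum_sub_p256(d, u, v);                // d = 12 * xy2 - 9 * x4p
  bignum_sub_p256(t1, t1, z2);             // y^2 + 2*y*z
  bignum_montsqr_p256(y4, y2);             // y^4
  bignum_montmul_p256(dx2, d, x2p);        // d * (x^2 - z^4)
  bignum_sub_p256(z3, t1, y2);             // z' = 2*y*z
  scale_small_p256(u, 4, xy2);
  bignum_sub_p256(x3, u, d);               // x' = 4 * xy2 - d
  scale_small_p256(u, 3, dx2);
  scale_small_p256(v, 8, y4);
  bignum_sub_p256(y3, u, v);               // y' = 3 * dx2 - 8 * y4
}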
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y4 (NUMSIZE*0)(%rsp) + +#define y2 (NUMSIZE*1)(%rsp) + +#define t1 (NUMSIZE*2)(%rsp) + +#define t2 (NUMSIZE*3)(%rsp) +#define x2p (NUMSIZE*3)(%rsp) +#define dx2 (NUMSIZE*3)(%rsp) + +#define xy2 (NUMSIZE*4)(%rsp) + +#define x4p (NUMSIZE*5)(%rsp) +#define d (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256 + +#define montmul_p256(P0,P1,P2) \ + xorl %r13d, %r13d ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + adcq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + adcq %r13, %r12 ; \ + movq 0x8+P2, %rdx ; \ + xorl %r14d, %r14d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcq %r14, %r13 ; \ + xorl %r15d, %r15d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %r15, %r13 ; \ + adoxq %r15, %r14 ; \ + adcq %r15, %r14 ; \ + movq 0x10+P2, %rdx ; \ + xorl %r8d, %r8d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adoxq %r8, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, %r13 ; \ + adcq %rbx, %r14 ; \ + adcq %r8, %r15 ; \ + movq 0x18+P2, %rdx ; \ + xorl %r9d, %r9d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + adoxq %r9, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, %r14 ; \ + adcq %rbx, %r15 ; \ + adcq %r9, %r8 ; \ + xorl %r9d, %r9d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; 
\ + adcxq %r9, %r15 ; \ + adoxq %r9, %r8 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rdx; \ + adcq %r13, %rdx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256 + +#define montsqr_p256(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebp, %ebp ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rbp, %r13 ; \ + adoxq %rbp, %r14 ; \ + adcq %rbp, %r14 ; \ + xorl %ebp, %ebp ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %r15 ; \ + xorl %ebp, %ebp ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rbp, %r13 ; \ + movl %ebp, %r9d ; \ + adoxq %rbp, %r9 ; \ + adcxq %rbp, %r9 ; \ + addq %r9, %r14 ; \ + adcq %rbp, %r15 ; \ + movl %ebp, %r8d ; \ + adcq %rbp, %r8 ; \ + xorl %ebp, %ebp ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rbp, %r15 ; \ + adoxq %rbp, %r8 ; \ + adcq %rbp, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + leaq -0x1(%rdx), %rdx ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbp), %rbp ; \ + movq %rbp, %rax ; \ + adcq %r14, %rbp ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbp, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 
0x18+P0 + +// Corresponds exactly to bignum_add_p256 + +#define add_p256(P0,P1,P2) \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + adcq %r11, %r11 ; \ + subq $0xffffffffffffffff, %rax ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r10, %rcx ; \ + sbbq $0x0, %r8 ; \ + movq $0xffffffff00000001, %rdx ; \ + sbbq %rdx, %r9 ; \ + sbbq $0x0, %r11 ; \ + andq %r11, %r10 ; \ + andq %r11, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_p256(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + subq %r11, %rax ; \ + movq %rax, P0 ; \ + sbbq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// P0 = C * P1 - D * P2 computed as d * (p_256 - P2) + c * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_p256 +// This also applies to the other functions following. + +#define cmsub_p256(P0,C,P1,D,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_256 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + xorl %r10d, %r10d ; \ + subq P2, %r8 ; \ + movq $0x00000000ffffffff, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xffffffff00000001, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = D * (p_256 - P2) */ \ + xorl %r12d, %r12d ; \ + movq $D, %rdx ; \ + mulxq %r8, %r8, %rax ; \ + mulxq %r9, %r9, %rcx ; \ + addq %rax, %r9 ; \ + mulxq %r10, %r10, %rax ; \ + adcq %rcx, %r10 ; \ + mulxq %r11, %r11, %rcx ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + C * P1 + D * (p_256 - P2) */ \ + movq $C, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 0x10+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %r11 ; \ + adoxq %r12, %rdx ; \ + adcq $1, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + addq %rdx, %r8 ; \ + movq $0x100000000, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq $0x0, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq %rax, %r11 ; \ + sbbq %rcx, %rdx ; \ + decq %rdx; \ + movl $0xffffffff, %eax ; \ + andq %rdx, %rax ; \ + xorl %ecx, %ecx ; \ + subq %rax, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 3 * P1 - 8 * P2, computed as (p_256 - P2) << 3 + 3 * P1 + +#define cmsub38_p256(P0,P1,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_256 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + xorl %r10d, %r10d ; \ + subq P2, %r8 ; \ + movq $0x00000000ffffffff, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xffffffff00000001, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* 
(%r12;%r11;%r10;%r9;%r8) = (p_256 - P2) << 3 */ \ + movq %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + shrq $61, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + 3 * P1 + 8 * (p_256 - P2) */ \ + movq $3, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 0x10+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %r11 ; \ + adoxq %r12, %rdx ; \ + adcq $1, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + addq %rdx, %r8 ; \ + movq $0x100000000, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq $0x0, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq %rax, %r11 ; \ + sbbq %rcx, %rdx ; \ + decq %rdx; \ + movl $0xffffffff, %eax ; \ + andq %rdx, %rax ; \ + xorl %ecx, %ecx ; \ + subq %rax, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 4 * P1 - P2, by direct subtraction of P2, +// since the quotient estimate still works safely +// for initial value > -p_256 + +#define cmsub41_p256(P0,P1,P2) \ + movq 0x18+P1, %r11 ; \ + movq %r11, %rdx ; \ + movq 0x10+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 0x8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + shrq $62, %rdx ; \ + addq $1, %rdx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq $0, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + addq %rdx, %r8 ; \ + movq $0x100000000, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq $0x0, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + movq $0xffffffff00000001, %rax ; \ + mulxq %rax, %rax, %rcx ; \ + sbbq %rax, %r11 ; \ + sbbq %rcx, %rdx ; \ + decq %rdx; \ + movl $0xffffffff, %eax ; \ + andq %rdx, %rax ; \ + xorl %ecx, %ecx ; \ + subq %rax, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(p256_montjdouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p256(z2,z_1) + montsqr_p256(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_p256(t2,x_1,z2) + weakadd_p256(t1,x_1,z2) + montmul_p256(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_p256(t1,y_1,z_1) + montmul_p256(xy2,x_1,y2) + montsqr_p256(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_p256(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p256(d,12,xy2,9,x4p) + sub_p256(t1,t1,z2) + +// y4 = y^4 + + montsqr_p256(y4,y2) + +// dx2 = d * x2p + + montmul_p256(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_p256(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_p256(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p256(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq 
%r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble_alt.S new file mode 100644 index 00000000000..6a3a5e630d9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjdouble_alt.S @@ -0,0 +1,743 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y4 (NUMSIZE*0)(%rsp) + +#define y2 (NUMSIZE*1)(%rsp) + +#define t1 (NUMSIZE*2)(%rsp) + +#define t2 (NUMSIZE*3)(%rsp) +#define x2p (NUMSIZE*3)(%rsp) +#define dx2 (NUMSIZE*3)(%rsp) + +#define xy2 (NUMSIZE*4)(%rsp) + +#define x4p (NUMSIZE*5)(%rsp) +#define d (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256_alt + +#define montmul_p256(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x8+P2, %rbx ; \ + xorl %r13d, %r13d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r14, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + xorl %r14d, %r14d ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r15, %r15 ; \ + movq 
%r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r15, %r15 ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P2, %rbx ; \ + xorl %r15d, %r15d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P2, %rbx ; \ + xorl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + xorl %r9d, %r9d ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rbx; \ + adcq %r13, %rbx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ 
+ sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + xorl %r8d, %r8d ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r8, %r14 ; \ + adcq %r8, %r15 ; \ + adcq %r8, %r8 ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + xorl %r9d, %r9d ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + leaq -0x1(%rbx), %rbx ; \ + adcq %r13, %rbx ; \ + leaq -0x1(%r9), %r9 ; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Corresponds exactly to bignum_add_p256 + +#define add_p256(P0,P1,P2) \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + adcq %r11, %r11 ; \ + subq $0xffffffffffffffff, %rax ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r10, %rcx ; \ + sbbq $0x0, %r8 ; \ + movq $0xffffffff00000001, %rdx ; \ + sbbq %rdx, %r9 ; \ + sbbq $0x0, %r11 ; \ + andq %r11, %r10 ; \ + andq %r11, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_p256(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + subq %r11, %rax ; \ + 
movq %rax, P0 ; \ + sbbq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// P0 = C * P1 - D * P2 computed as d * (p_256 - P2) + c * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_p256_alt. +// This also applies to the other functions following. + +#define cmsub_p256(P0,C,P1,D,P2) \ + /* First (%r12;%r11;%r10;%r9) = p_256 - P2 */ \ + movq $0xffffffffffffffff, %r9 ; \ + xorl %r11d, %r11d ; \ + subq P2, %r9 ; \ + movq $0x00000000ffffffff, %r10 ; \ + sbbq 0x8+P2, %r10 ; \ + sbbq 0x10+P2, %r11 ; \ + movq $0xffffffff00000001, %r12 ; \ + sbbq 0x18+P2, %r12 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = D * (p_256 - P2) */ \ + movq $D, %rcx ; \ + movq %r9, %rax ; \ + mulq %rcx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq %r10, %rax ; \ + xorl %r10d, %r10d ; \ + mulq %rcx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq %r11, %rax ; \ + xorl %r11d, %r11d ; \ + mulq %rcx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r12, %rax ; \ + xorl %r12d, %r12d ; \ + mulq %rcx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + /* (%rcx;%r11;%r10;%r9;%r8) = 2^256 + C * P1 + D * (p_256 - P2) */ \ + movl $C, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + leaq 1(%r12), %rcx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq $0xffffffff00000001, %rax ; \ + mulq %rcx; \ + movq %rcx, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %rcx, %r8 ; \ + sbbq $0x0, %rbx ; \ + subq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq %rdx, %rcx ; \ + decq %rcx; \ + movl $0xffffffff, %eax ; \ + andq %rcx, %rax ; \ + xorl %edx, %edx ; \ + subq %rax, %rdx ; \ + addq %rcx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rdx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 3 * P1 - 8 * P2, computed as (p_256 - P2) << 3 + 3 * P1 + +#define cmsub38_p256(P0,P1,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_256 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + xorl %r10d, %r10d ; \ + subq P2, %r8 ; \ + movq $0x00000000ffffffff, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xffffffff00000001, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = (p_256 - P2) << 3 */ \ + movq %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + shrq $61, %r12 ; \ + /* (%rcx;%r11;%r10;%r9;%r8) = 2^256 + 3 * P1 + 8 * (p_256 - P2) */ \ + movl $3, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + leaq 1(%r12), %rcx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq $0xffffffff00000001, %rax ; \ + mulq %rcx; \ + movq %rcx, %rbx ; 
\ + shlq $0x20, %rbx ; \ + addq %rcx, %r8 ; \ + sbbq $0x0, %rbx ; \ + subq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq %rdx, %rcx ; \ + decq %rcx; \ + movl $0xffffffff, %eax ; \ + andq %rcx, %rax ; \ + xorl %edx, %edx ; \ + subq %rax, %rdx ; \ + addq %rcx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rdx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 4 * P1 - P2, by direct subtraction of P2, +// since the quotient estimate still works safely +// for initial value > -p_256 + +#define cmsub41_p256(P0,P1,P2) \ + movq 0x18+P1, %r11 ; \ + movq %r11, %rcx ; \ + movq 0x10+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 0x8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + shrq $62, %rcx ; \ + addq $1, %rcx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq $0, %rcx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq $0xffffffff00000001, %rax ; \ + mulq %rcx; \ + movq %rcx, %rbx ; \ + shlq $0x20, %rbx ; \ + addq %rcx, %r8 ; \ + sbbq $0x0, %rbx ; \ + subq %rbx, %r9 ; \ + sbbq $0x0, %r10 ; \ + sbbq %rax, %r11 ; \ + sbbq %rdx, %rcx ; \ + decq %rcx; \ + movl $0xffffffff, %eax ; \ + andq %rcx, %rax ; \ + xorl %edx, %edx ; \ + subq %rax, %rdx ; \ + addq %rcx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq $0x0, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rdx, %r11 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(p256_montjdouble_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_p256(z2,z_1) + montsqr_p256(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_p256(t2,x_1,z2) + weakadd_p256(t1,x_1,z2) + montmul_p256(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_p256(t1,y_1,z_1) + montmul_p256(xy2,x_1,y2) + montsqr_p256(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_p256(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_p256(d,12,xy2,9,x4p) + sub_p256(t1,t1,z2) + +// y4 = y^4 + + montsqr_p256(y4,y2) + +// dx2 = d * x2p + + montmul_p256(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_p256(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_p256(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_p256(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd.S new file mode 100644 index 00000000000..51b1f4923b7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd.S @@ -0,0 +1,562 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
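
As a cross-check on the doubling schedule used by p256_montjdouble and p256_montjdouble_alt above (z2 = z^2, x2p = x^2 - z^4, d = 12*xy2 - 9*x4p, x' = 4*xy2 - d, y' = 3*dx2 - 8*y4, z' = 2*y*z), here is a minimal sketch replaying the same sequence in plain modular arithmetic, with Montgomery form omitted. The name jacobian_double_p256 is illustrative only, not part of the patch; the result matches the usual a = -3 Jacobian doubling with alpha = 3*(x - z^2)*(x + z^2).

```python
p256 = 2**256 - 2**224 + 2**192 + 2**96 - 1

def jacobian_double_p256(x, y, z):
    # Same field-operation schedule as the assembly, without Montgomery form.
    z2  = z * z % p256
    y2  = y * y % p256
    x2p = (x - z2) * (x + z2) % p256      # x^2 - z^4 = alpha / 3
    t1  = (y + z) % p256
    xy2 = x * y2 % p256                   # beta = x * y^2
    x4p = x2p * x2p % p256
    t1  = t1 * t1 % p256                  # (y + z)^2
    d   = (12 * xy2 - 9 * x4p) % p256
    t1  = (t1 - z2) % p256                # y^2 + 2*y*z
    y4  = y2 * y2 % p256
    dx2 = d * x2p % p256
    z3  = (t1 - y2) % p256                # 2*y*z
    x3  = (4 * xy2 - d) % p256            # = 9*x2p^2 - 8*beta = alpha^2 - 8*beta
    y3  = (3 * dx2 - 8 * y4) % p256       # = alpha*(4*beta - x3) - 8*y^4
    return x3, y3, z3
```
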
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjmixadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs. +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// The first two hold initially, and the second is +// set up by copying the initial %rdx input to %rbp. +// Thereafter, no code macro modifies any of them. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing. +// NSPACE is the total stack needed for all temporaries. + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256 + +#define montmul_p256(P0,P1,P2) \ + xorl %r13d, %r13d ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rbx, %r10 ; \ + adcq %rbx, %r9 ; \ + mulxq 0x10+P1, %rbx, %r11 ; \ + adcq %rbx, %r10 ; \ + mulxq 0x18+P1, %rbx, %r12 ; \ + adcq %rbx, %r11 ; \ + adcq %r13, %r12 ; \ + movq 0x8+P2, %rdx ; \ + xorl %r14d, %r14d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcq %r14, %r13 ; \ + xorl %r15d, %r15d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %r15, %r13 ; \ + adoxq %r15, %r14 ; \ + adcq %r15, %r14 ; \ + movq 0x10+P2, %rdx ; \ + xorl %r8d, %r8d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, 
%r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adoxq %r8, %r14 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, %r13 ; \ + adcq %rbx, %r14 ; \ + adcq %r8, %r15 ; \ + movq 0x18+P2, %rdx ; \ + xorl %r9d, %r9d ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + adoxq %r9, %r15 ; \ + mulxq 0x18+P1, %rax, %rbx ; \ + adcq %rax, %r14 ; \ + adcq %rbx, %r15 ; \ + adcq %r9, %r8 ; \ + xorl %r9d, %r9d ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + notq %rdx; \ + leaq 0x2(%rdx), %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %r9, %r15 ; \ + adoxq %r9, %r8 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rdx; \ + adcq %r13, %rdx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256 except for +// register tweaks to avoid modifying %rbp. + +#define montsqr_p256(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + movl %ecx, %r9d ; \ + adoxq %rcx, %r9 ; \ + adcxq %rcx, %r9 ; \ + addq %r9, %r14 ; \ + adcq %rcx, %r15 ; \ + movl %ecx, %r8d ; \ + adcq %rcx, %r8 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, 
%r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r8 ; \ + adcq %rcx, %r8 ; \ + movl $0x1, %ebx ; \ + addq %r12, %rbx ; \ + leaq -0x1(%rdx), %rdx ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rcx), %rcx ; \ + movq %rcx, %rax ; \ + adcq %r14, %rcx ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rbx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rcx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). +// Again, the basic squaring code is tweaked to avoid modifying %rbp. + +#define amontsqr_p256(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r8, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r9, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + movl %ecx, %r9d ; \ + adoxq %rcx, %r9 ; \ + adcxq %rcx, %r9 ; \ + addq %r9, %r14 ; \ + adcq %rcx, %r15 ; \ + movl %ecx, %r8d ; \ + adcq %rcx, %r8 ; \ + xorl %ecx, %ecx ; \ + movabsq $0x100000000, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + movabsq $0xffffffff00000001, %rdx ; \ + mulxq %r10, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq %r11, %rax, %rbx ; \ + adcxq %rax, %r14 ; \ + adoxq %rbx, %r15 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r8 ; \ + adcq %rcx, %r8 ; \ + movl $0x1, %r8d ; \ + leaq -0x1(%rdx), %rdx ; \ + leaq -0x1(%rcx), %rax ; \ + movl $0xfffffffe, %r11d ; \ + cmovzq %rcx, %r8 ; \ + cmovzq %rcx, %rdx ; \ + cmovzq %rcx, %rax ; \ + cmovzq %rcx, %r11 ; \ + addq %r8, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %rax, %r14 ; \ + adcq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, 
%rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; \ + movq PEQ, %rax ; \ + cmovzq %rax, r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(p256_montjmixadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts as long as it's needed. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_p256(zp2,z_1) + + montmul_p256(y2a,z_1,y_2) + montmul_p256(x2a,zp2,x_2) + montmul_p256(y2a,zp2,y2a) + + sub_p256(xd,x2a,x_1) + + sub_p256(yd,y2a,y_1) + + amontsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x_1) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(resz,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y_1) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_256), +// hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movq $0xffffffff00000000, %rax + cmovzq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmovzq %rax, %r10 + movl $0x00000000fffffffe, %eax + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd_alt.S new file mode 100644 index 00000000000..55f2dca1d3e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjmixadd_alt.S @@ -0,0 +1,547 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
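
The headers above keep every coordinate in the Montgomery domain, x' = (2^256 * x) mod p_256, and the multiplexing path of p256_montjmixadd writes the z = 1 coordinate as 2^256 - p_256 rather than the literal constant 1. A small plain-Python check of those two facts follows, together with an abstract model of what the montmul/montsqr macros compute; montredc and montmul are illustrative names, not from the patch, and the word-by-word model works because p_256 = -1 (mod 2^64), so the per-word quotient is simply the current low word.

```python
p256 = 2**256 - 2**224 + 2**192 + 2**96 - 1

# Montgomery form of 1 is 2^256 mod p_256 = 2^256 - p_256; its 64-bit
# limbs are exactly the constants loaded for z_3 in the p1 = 0 case above.
one_m = (1 << 256) % p256
assert one_m == (1 << 256) - p256
assert [(one_m >> (64 * i)) & (2**64 - 1) for i in range(4)] == \
       [0x0000000000000001, 0xffffffff00000000,
        0xffffffffffffffff, 0x00000000fffffffe]

def montredc(t):
    # Word-by-word Montgomery reduction: divide by 2^256 modulo p_256.
    for _ in range(4):
        t = (t + (t & (2**64 - 1)) * p256) >> 64
    return t - p256 if t >= p256 else t

def montmul(a, b):
    # Abstract model of the montmul macros: a' * b' |-> (a*b)' in the
    # Montgomery domain, i.e. a * b * 2^-256 mod p_256.
    return montredc(a * b)
```
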
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on NIST curve P-256 in Montgomery-Jacobian coordinates +// +// extern void p256_montjmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_256. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjmixadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs. +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// The first two hold initially, and the second is +// set up by copying the initial %rdx input to %rbp. +// Thereafter, no code macro modifies any of them. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing. +// NSPACE is the total stack needed for all temporaries. + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_montmul_p256_alt + +#define montmul_p256(P0,P1,P2) \ + movq P2, %rbx ; \ + movq P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + xorl %r10d, %r10d ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + xorl %r11d, %r11d ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + xorl %r12d, %r12d ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x8+P2, %rbx ; \ + xorl %r13d, %r13d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r14, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r14, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + xorl %r14d, %r14d ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %r15, %r15 ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + notq %rbx; \ + leaq 
0x2(%rbx), %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r15, %r15 ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %r15, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P2, %rbx ; \ + xorl %r15d, %r15d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r8, %r8 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r8, %r8 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r8, %r8 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r8, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P2, %rbx ; \ + xorl %r8d, %r8d ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %r9, %r9 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r9, %r9 ; \ + movq 0x10+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %r9, %r9 ; \ + movq 0x18+P1, %rax ; \ + mulq %rbx; \ + subq %r9, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r8, %r8 ; \ + xorl %r9d, %r9d ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + decq %rbx; \ + adcq %r13, %rbx ; \ + decq %r9; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_montsqr_p256_alt + +#define montsqr_p256(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 
; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq $0x100000000, %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r8, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + xorl %r8d, %r8d ; \ + movq %r9, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r8, %r14 ; \ + adcq %r8, %r15 ; \ + adcq %r8, %r8 ; \ + movq $0x100000000, %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %rcx, %rcx ; \ + notq %rbx; \ + leaq 0x2(%rbx), %rbx ; \ + movq %r10, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + sbbq %rcx, %rcx ; \ + xorl %r9d, %r9d ; \ + movq %r11, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + adcq %r9, %r8 ; \ + movl $0x1, %ecx ; \ + addq %r12, %rcx ; \ + leaq -0x1(%rbx), %rbx ; \ + adcq %r13, %rbx ; \ + leaq -0x1(%r9), %r9 ; \ + movq %r9, %rax ; \ + adcq %r14, %r9 ; \ + movl $0xfffffffe, %r11d ; \ + adcq %r15, %r11 ; \ + adcq %r8, %rax ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rbx, %r13 ; \ + cmovbq %r9, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256 + +#define sub_p256(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movl $0xffffffff, %r10d ; \ + sbbq %r11, %r11 ; \ + xorq %rdx, %rdx ; \ + andq %r11, %r10 ; \ + subq %r10, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq $0x0, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; \ + movq PEQ, %rax ; \ + cmovzq %rax, r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(p256_montjmixadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts as long as it's needed. 
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_p256(zp2,z_1) + + montmul_p256(y2a,z_1,y_2) + montmul_p256(x2a,zp2,x_2) + montmul_p256(y2a,zp2,y2a) + + sub_p256(xd,x2a,x_1) + + sub_p256(yd,y2a,y_1) + + montsqr_p256(zz,xd) + montsqr_p256(ww,yd) + + montmul_p256(zzx1,zz,x_1) + montmul_p256(zzx2,zz,x2a) + + sub_p256(resx,ww,zzx1) + sub_p256(t1,zzx2,zzx1) + + montmul_p256(resz,xd,z_1) + + sub_p256(resx,resx,zzx2) + + sub_p256(t2,zzx1,resx) + + montmul_p256(t1,t1,y_1) + montmul_p256(t2,yd,t2) + + sub_p256(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_256), +// hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movq $0xffffffff00000000, %rax + cmovzq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmovzq %rax, %r10 + movl $0x00000000fffffffe, %eax + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul.S index 4569646cd31..1904ca193eb 100644 --- a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul.S @@ -76,6 +76,7 @@ cmovzq TAB+96*(I-1)+88(%rsp), %r15 S2N_BN_SYMBOL(p256_montjscalarmul): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. diff --git a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul_alt.S index b68d857e76b..1124a046fe1 100644 --- a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_montjscalarmul_alt.S @@ -76,6 +76,7 @@ cmovzq TAB+96*(I-1)+88(%rsp), %r15 S2N_BN_SYMBOL(p256_montjscalarmul_alt): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. 
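
The 8-multiply / 3-square / 7-subtract schedule shared by p256_montjmixadd and p256_montjmixadd_alt above is the standard Jacobian mixed addition with H = U2 - X1 and r = S2 - Y1. Here is a minimal sketch replaying that schedule in plain modular arithmetic, with Montgomery form and the p1 = 0 multiplexing omitted; mixadd_schedule is an illustrative name, not part of the patch.

```python
p256 = 2**256 - 2**224 + 2**192 + 2**96 - 1

def mixadd_schedule(x1, y1, z1, x2, y2):
    # Same order of field operations as the assembly's temporaries.
    zp2  = z1 * z1 % p256             # z1^2
    y2a  = z1 * y2 % p256
    x2a  = zp2 * x2 % p256            # U2 = x2 * z1^2
    y2a  = zp2 * y2a % p256           # S2 = y2 * z1^3
    xd   = (x2a - x1) % p256          # H
    yd   = (y2a - y1) % p256          # r
    zz   = xd * xd % p256             # H^2
    ww   = yd * yd % p256             # r^2
    zzx1 = zz * x1 % p256             # X1*H^2
    zzx2 = zz * x2a % p256            # U2*H^2 = X1*H^2 + H^3
    resx = (ww - zzx1) % p256
    t1   = (zzx2 - zzx1) % p256       # H^3
    resz = xd * z1 % p256             # Z3 = H*z1
    resx = (resx - zzx2) % p256       # X3 = r^2 - H^3 - 2*X1*H^2
    t2   = (zzx1 - resx) % p256       # X1*H^2 - X3
    t1   = t1 * y1 % p256             # Y1*H^3
    t2   = yd * t2 % p256
    resy = (t2 - t1) % p256           # Y3 = r*(X1*H^2 - X3) - Y1*H^3
    return resx, resy, resz
```
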
diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul.S new file mode 100644 index 00000000000..3e426269977 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul.S @@ -0,0 +1,6802 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for P-256 +// Input scalar[4], point[8]; output res[8] +// +// extern void p256_scalarmul +// (uint64_t res[static 8],uint64_t scalar[static 4], +// uint64_t point[static 8]); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, returns the point (X,Y) = n * P. The input and output +// are affine points, and in the case of the point at infinity as +// the result, (0,0) is returned. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. The last z2, z3 values can +// safely be overlaid on the table, which is no longer needed at the end. +// Uppercase syntactic variants make x86_att version simpler to generate + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define Z2 (7*NUMSIZE) +#define z2 (7*NUMSIZE)(%rsp) +#define Z3 (8*NUMSIZE) +#define z3 (8*NUMSIZE)(%rsp) + +#define res (31*NUMSIZE)(%rsp) + +#define NSPACE (32*NUMSIZE) + +S2N_BN_SYMBOL(p256_scalarmul): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq p256_scalarmul_standard + popq %rsi + popq %rdi + ret + +p256_scalarmul_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + movq %rdx, %rbx + movq %rdi, res + +// Load the digits of group order n_256 = [%r15;%r14;%r13;%r12] + + movq $0xf3b9cac2fc632551, %r12 + movq $0xbce6faada7179e84, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xffffffff00000000, %r15 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + +// Now if the top bit of the reduced scalar is set, negate it mod n_256, +// i.e. do n |-> n_256 - n. Remember the sign in %rbp so we can +// correspondingly negate the point below. 
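
The scalar preprocessing described here (reduce mod n_256, negate when the top bit is set and remember the sign so the point can be negated to compensate) combines with the 0x888...8 recoding constant and the bit-255 flip added just below to turn the scalar into one unsigned top window in 0..8 plus 63 signed 4-bit digits in [-8, 7]. A hedged plain-Python sketch of that recoding follows; recode_p256 is an illustrative name, not part of the patch.

```python
n256 = 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551
K = int("8" * 64, 16)                 # the 0x8888...8888 recoding constant

def recode_p256(n):
    n %= n256                         # conditional subtraction of n_256
    neg = n >> 255                    # sign flag; the point is negated when set
    if neg:
        n = n256 - n                  # now n < 2^255
    m = ((n + K) % 2**256) ^ (1 << 255)   # add constant, toggle bit 255 (btc)
    digits = [((m >> (4 * i)) & 0xf) - 8 for i in range(63)]
    top = m >> 252                    # top window, used unsigned (0..8)
    # The digits reconstruct the (possibly negated) scalar exactly.
    assert top * 16**63 + sum(d * 16**i for i, d in enumerate(digits)) == n
    return neg, top, digits
```
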
+ + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %r11, %rbp + shrq $63, %rbp + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + movq $0x8888888888888888, %rax + addq %rax, %r8 + adcq %rax, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + btc $63, %r11 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + +// Set the tab[0] table entry to Montgomery-Jacobian point = 1 * P +// The z coordinate is just the Montgomery form of the constant 1. + + leaq TAB(%rsp), %rdi + movq %rbx, %rsi + callq p256_scalarmul_local_tomont_p256 + + leaq 32(%rbx), %rsi + leaq TAB+32(%rsp), %rdi + callq p256_scalarmul_local_tomont_p256 + + movl $1, %eax + movq %rax, TAB+64(%rsp) + movq $0xffffffff00000000, %rdx + movq %rdx, TAB+72(%rsp) + subq $2, %rax + movq %rax, TAB+80(%rsp) + movq $0x00000000fffffffe, %rax + movq %rax, TAB+88(%rsp) + +// If the top bit of the scalar was set, negate (y coordinate of) the point + + movq TAB+32(%rsp), %r12 + movq TAB+40(%rsp), %r13 + movq TAB+48(%rsp), %r14 + movq TAB+56(%rsp), %r15 + + xorl %r10d, %r10d + leaq -1(%r10), %r8 + movq $0x00000000ffffffff, %r11 + movq %r11, %r9 + negq %r11 + + subq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + sbbq %r15, %r11 + + testq %rbp, %rbp + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + + movq %r8, TAB+32(%rsp) + movq %r9, TAB+40(%rsp) + movq %r10, TAB+48(%rsp) + movq %r11, TAB+56(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + leaq TAB+96*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq p256_scalarmul_local_p256_montjdouble + + leaq TAB+96*2(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_local_p256_montjmixadd + + leaq TAB+96*3(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + callq p256_scalarmul_local_p256_montjdouble + + leaq TAB+96*4(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_local_p256_montjmixadd + + leaq TAB+96*5(%rsp), %rdi + leaq TAB+96*2(%rsp), %rsi + callq p256_scalarmul_local_p256_montjdouble + + leaq TAB+96*6(%rsp), %rdi + leaq TAB+96*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_local_p256_montjmixadd + + leaq TAB+96*7(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + callq p256_scalarmul_local_p256_montjdouble + +// Set up accumulator as table entry for top 4 bits (constant-time indexing) + + movq SCALARB+24(%rsp), %rdi + shrq $60, %rdi + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + movq %rax, ACC(%rsp) + movq %rbx, ACC+8(%rsp) + movq %rcx, ACC+16(%rsp) + movq %rdx, ACC+24(%rsp) + movq %r8, ACC+32(%rsp) + movq %r9, ACC+40(%rsp) + movq %r10, ACC+48(%rsp) + movq %r11, ACC+56(%rsp) + movq %r12, ACC+64(%rsp) + movq 
%r13, ACC+72(%rsp) + movq %r14, ACC+80(%rsp) + movq %r15, ACC+88(%rsp) + +// Main loop over size-4 bitfield + + movl $252, %ebp + +p256_scalarmul_loop: + subq $4, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_local_p256_montjdouble + + movq %rbp, %rax + shrq $6, %rax + movq (%rsp,%rax,8), %rdi + movq %rbp, %rcx + shrq %cl, %rdi + andq $15, %rdi + + subq $8, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + + movq %r12, TABENT+64(%rsp) + movq %r13, TABENT+72(%rsp) + movq %r14, TABENT+80(%rsp) + movq %r15, TABENT+88(%rsp) + + xorl %r14d, %r14d + leaq -1(%r14), %r12 + movq $0x00000000ffffffff, %r15 + movq %r15, %r13 + negq %r15 + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + testq %rsi, %rsi + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_local_p256_montjadd + + testq %rbp, %rbp + jne p256_scalarmul_loop + +// Let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + callq p256_scalarmul_local_montsqr_p256 + + leaq Z3(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z2(%rsp), %rdx + callq p256_scalarmul_local_montmul_p256 + + leaq Z2(%rsp), %rdi + leaq Z3(%rsp), %rsi + callq p256_scalarmul_local_demont_p256 + + leaq Z3(%rsp), %rdi + leaq Z2(%rsp), %rsi + callq p256_scalarmul_local_inv_p256 + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmul_local_montmul_p256 + +// Convert back from Jacobian (X, Y, Z) |-> (X/Z^2, Y/Z^3) + + movq res, %rdi + leaq ACC(%rsp), %rsi + leaq Z2(%rsp), %rdx + movq %rdi, %rbx + callq p256_scalarmul_local_montmul_p256 + + leaq 32(%rbx), %rdi + leaq ACC+32(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmul_local_montmul_p256 + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmul_local_demont_p256: + pushq %rbx + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + xorq %rbx, %rbx 
+ xorq %rsi, %rsi + movq $0x100000000, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r9, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %rbx + mulxq %r9, %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %rsi + movl $0x0, %r8d + adcxq %r8, %rsi + xorq %r9, %r9 + movq $0x100000000, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %rbx + mulxq %r11, %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %rsi + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %rsi + adoxq %rcx, %r8 + mulxq %r11, %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movl $0x0, %r10d + adcxq %r10, %r9 + movq %rbx, (%rdi) + movq %rsi, 0x8(%rdi) + movq %r8, 0x10(%rdi) + movq %r9, 0x18(%rdi) + popq %rbx + ret + +p256_scalarmul_local_inv_p256: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xf0, %rsp + movq %rdi, 0xe0(%rsp) + xorl %ecx, %ecx + movl $0xffffffff, %edx + movq %rdx, %rbx + leaq -0x1(%rcx), %rax + negq %rdx + movq %rax, (%rsp) + movq %rbx, 0x8(%rsp) + movq %rcx, 0x10(%rsp) + movq %rdx, 0x18(%rsp) + movq %rcx, 0x20(%rsp) + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + leaq 0x1(%rcx), %rax + addq %r8, %rax + leaq -0x1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq %rax, 0x28(%rsp) + movq %rbx, 0x30(%rsp) + movq %rcx, 0x38(%rsp) + movq %rdx, 0x40(%rsp) + xorl %eax, %eax + movq %rax, 0x48(%rsp) + xorl %eax, %eax + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movq %rax, 0x60(%rsp) + movq %rax, 0x68(%rsp) + movq $0x4000000000000, %rcx + movq %rcx, 0x78(%rsp) + movq %rax, 0x80(%rsp) + movq %rax, 0x88(%rsp) + movq %rax, 0x90(%rsp) + movq $0xa, 0xb0(%rsp) + movq $0x1, 0xb8(%rsp) + jmp p256_scalarmul_inv_midloop +p256_scalarmul_inv_loop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0xa0(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0xa8(%rsp) + xorl %ebx, %ebx + movq (%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx + movq 0x8(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x28(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx 
+ adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x38(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x30(%rsp) + movq 0x18(%rsp), %rax + xorq %r9, %rax + movq 0x20(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x40(%rsp), %rax + xorq %r11, %rax + movq 0x48(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + sarq $0x3b, %rbp + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + movq 0x20(%rsp), %rsi + movq %rbp, 0x20(%rsp) + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x40(%rsp), %rax + xorq %r15, %rax + movq 0x48(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x38(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x40(%rsp) + sarq $0x3b, %rsi + movq %rsi, 0x48(%rsp) + movq 0xa0(%rsp), %rbx + movq 0xa8(%rsp), %rbp + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x78(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x78(%rsp) + xorl %ebx, %ebx + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x58(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x58(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x80(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x80(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x60(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x60(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x88(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x88(%rsp) + movq 0x68(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq 0x68(%rsp), %rax + movq %rcx, 0x68(%rsp) + movq %rdx, 0x70(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq 0x90(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, 0x90(%rsp) + movq %rdx, 0x98(%rsp) + movq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), 
%r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq $0xe000000000000000, %r8 + addq 0x78(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x80(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x88(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x90(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x98(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x78(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x80(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x88(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x90(%rsp) +p256_scalarmul_inv_midloop: + movq 0xb8(%rsp), %rsi + movq (%rsp), %rdx + movq 0x28(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, 
%rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xc0(%rsp) + movq %rbx, 0xc8(%rsp) + movq %rdi, 0xd0(%rsp) + movq %rcx, 0xd8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x28(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq 
$0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq 
%r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xc0(%rsp), %rax + imulq %r8, %rax + movq 0xd0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xc8(%rsp), %r8 + imulq 0xd8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xc0(%rsp), %rax + imulq %r10, %rax + movq 0xd0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xc8(%rsp), %r10 + imulq 0xd8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, 
%r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq 
$0x2b, %r8 + sarq $0x2b, %r12 + movq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0xb8(%rsp) + decq 0xb0(%rsp) + jne p256_scalarmul_inv_loop + movq (%rsp), %rax + movq 0x28(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x68(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r12, 0x50(%rsp) + movq %r13, 0x58(%rsp) + movq %r14, 0x60(%rsp) + movq %r15, 0x68(%rsp) + movq %r9, 0x70(%rsp) + movq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq 0x50(%rsp), %r8 + movq 0x58(%rsp), %r9 + movq 0x60(%rsp), %r10 + movq 0x68(%rsp), %r11 + movl $0x1, %eax + movl $0xffffffff, %ebx + leaq -0x2(%rax), %rcx + leaq -0x1(%rbx), %rdx + notq %rbx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq 0xe0(%rsp), %rdi + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + addq $0xf0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_local_montmul_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, %rcx + xorl %r13d, %r13d + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rbx, %r10 + 
adcq %rbx, %r9 + mulxq 0x10(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rcx), %rdx + xorl %r14d, %r14d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rcx), %rdx + xorl %r8d, %r8d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rcx), %rdx + xorl %r9d, %r9d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmul_local_montsqr_p256: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x8(%rsi), %r9, %r10 + mulxq 0x18(%rsi), %r11, %r12 + movq 0x10(%rsi), %rdx + mulxq 0x18(%rsi), %r13, %r14 + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x18(%rsi), %rdx + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x8(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x10(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x18(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + 
adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_local_tomont_p256: + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + xorq %r13, %r13 + movl $0x3, %edx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rcx, %r10 + adcxq %rcx, %r9 + mulxq 0x10(%rsi), %rcx, %r11 + adcxq %rcx, %r10 + mulxq 0x18(%rsi), %rcx, %r12 + adcxq %rcx, %r11 + adcxq %r13, %r12 + movq $0xfffffffbffffffff, %rdx + xorq %r14, %r14 + mulxq (%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x8(%rsi), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x10(%rsi), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq 0x18(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcq %r14, %r13 + xorq %r15, %r15 + movq $0x100000000, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r9, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq %r9, %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcxq %r15, %r14 + movq $0xfffffffffffffffe, %rdx + xorq %r8, %r8 + mulxq (%rsi), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x8(%rsi), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq 0x10(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + mulxq 0x18(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %r8, %r14 + adoxq %r8, %r15 + adcxq %r8, %r15 + movq $0x4fffffffd, %rdx + xorq %r9, %r9 + mulxq (%rsi), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq 0x8(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + mulxq 0x10(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + mulxq 0x18(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcxq %r9, %r8 + xorq %r9, %r9 + movq $0x100000000, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq %r11, %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + mulxq %r11, %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcxq %r9, %r8 + movl $0xffffffff, %edx + movq $0xffffffff00000001, %rcx + movq $0xfffffffffffffffe, %rax + subq %r12, %rax + movq %rdx, %rax + sbbq %r13, %rax + movl $0x0, %eax + sbbq %r14, %rax + movq %rcx, %rax + sbbq %r15, %rax + movl $0x0, %eax + sbbq %r8, %rax + andq %rax, %rdx + andq %rax, %rcx + subq %rax, %r12 + sbbq %rdx, %r13 + 
sbbq $0x0, %r14 + sbbq %rcx, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + ret + +p256_scalarmul_local_p256_montjadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xe0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x40(%rbp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rbp), %r9, %r10 + mulxq 0x58(%rbp), %r11, %r12 + movq 0x50(%rbp), %rdx + mulxq 0x58(%rbp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rbp), %rdx + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rbp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rbp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rbp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + 
adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsi), %rdx + mulxq 0x40(%rbp), %r8, %r9 + mulxq 0x48(%rbp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rbp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rbp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rbp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rbp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + xorl %r13d, %r13d + movq 0x20(%rbp), %rdx + mulxq 0x40(%rsi), %r8, %r9 + mulxq 0x48(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rsi), %rbx, %r12 + adcq %rbx, %r11 + 
adcq %r13, %r12 + movq 0x28(%rbp), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rbp), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rbp), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %r13d, %r13d + movq 0x0(%rbp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rbp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rbp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rbp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq 
%rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + xorl %r13d, %r13d + movq (%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, 
%r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %r13d, %r13d + movq 0xc0(%rsp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0xc8(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0xd0(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0xd8(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, 
%rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0xc0(%rsp), %rax + movq 0x28(%rsp), %rcx + sbbq 0xc8(%rsp), %rcx + movq 0x30(%rsp), %r8 + sbbq 0xd0(%rsp), %r8 + movq 0x38(%rsp), %r9 + sbbq 0xd8(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0xa8(%rsp), %r9, %r10 + mulxq 0xb8(%rsp), %r11, %r12 + movq 0xb0(%rsp), %rdx + mulxq 0xb8(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0xb8(%rsp), %rdx + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0xa8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0xb0(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0xb8(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, 
%rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %ebx + addq %r12, %rbx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rcx), %rcx + movq %rcx, %rax + adcq %r14, %rcx + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rbx, %r12 + cmovbq %rdx, %r13 + cmovbq %rcx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq 0x80(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x88(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x90(%rsp), %rdx + xorl %r8d, 
%r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x98(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl 
$0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + 
movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + xorl %r13d, %r13d + movq 0xc0(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0xc8(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0xd0(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0xd8(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x40(%rbp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rbp), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, 
%r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rbp), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rbp), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + xorl %r13d, %r13d + movq 0x80(%rsp), %rdx + mulxq 0x20(%rsp), %r8, %r9 + mulxq 0x28(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x30(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x38(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x88(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x90(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x98(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + 
mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %r8 + movq 0x48(%rsi), %r9 + movq 0x50(%rsi), %r10 + movq 0x58(%rsi), %r11 + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + movq 0x40(%rbp), %r12 + movq 0x48(%rbp), %r13 + movq 0x50(%rbp), %r14 + movq 0x58(%rbp), %r15 + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + cmpq %rax, %rbx + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + cmoveq 0xa0(%rsp), %r12 + cmoveq 0xa8(%rsp), %r13 + cmoveq 0xb0(%rsp), %r14 + cmoveq 0xb8(%rsp), %r15 + movq (%rsp), %rax + cmovbq (%rsi), %rax + cmova 0x0(%rbp), %rax + movq 0x8(%rsp), %rbx + cmovbq 0x8(%rsi), %rbx + cmova 0x8(%rbp), %rbx + movq 0x10(%rsp), %rcx + cmovbq 0x10(%rsi), %rcx + cmova 0x10(%rbp), %rcx + movq 0x18(%rsp), %rdx + cmovbq 0x18(%rsi), %rdx + cmova 0x18(%rbp), %rdx + movq 0x80(%rsp), %r8 + cmovbq 0x20(%rsi), %r8 + cmova 0x20(%rbp), %r8 + movq 0x88(%rsp), %r9 + cmovbq 0x28(%rsi), %r9 + cmova 0x28(%rbp), %r9 + movq 0x90(%rsp), %r10 + cmovbq 0x30(%rsi), %r10 + cmova 0x30(%rbp), %r10 + movq 0x98(%rsp), %r11 + cmovbq 0x38(%rsi), %r11 + cmova 0x38(%rbp), %r11 + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + movq %r8, 0x20(%rdi) + movq %r9, 0x28(%rdi) + movq %r10, 0x30(%rdi) + movq %r11, 0x38(%rdi) + movq %r12, 0x40(%rdi) + movq %r13, 0x48(%rdi) + movq %r14, 0x50(%rdi) + movq %r15, 0x58(%rdi) + addq $0xe0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_local_p256_montjdouble: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp 
+ adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsi), %r9, %r10 + mulxq 0x38(%rsi), %r11, %r12 + movq 0x30(%rsi), %rdx + mulxq 0x38(%rsi), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsi), %rdx + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, 
%r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + subq (%rsp), %rax + movq 0x8(%rsi), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq (%rsi), %rax + addq (%rsp), %rax + movq 0x8(%rsi), %rcx + adcq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + adcq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + subq %r11, %rax + movq %rax, 0x40(%rsp) + sbbq %r10, %rcx + movq %rcx, 0x48(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x50(%rsp) + sbbq %rdx, %r9 + movq %r9, 0x58(%rsp) + xorl %r13d, %r13d + movq 0x60(%rsp), %rdx + mulxq 0x40(%rsp), %r8, %r9 + mulxq 0x48(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x68(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x70(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x78(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorq %r11, %r11 + movq 0x20(%rsi), %rax + addq 0x40(%rsi), %rax + movq 0x28(%rsi), 
%rcx + adcq 0x48(%rsi), %rcx + movq 0x30(%rsi), %r8 + adcq 0x50(%rsi), %r8 + movq 0x38(%rsi), %r9 + adcq 0x58(%rsi), %r9 + adcq %r11, %r11 + subq $0xffffffffffffffff, %rax + movl $0xffffffff, %r10d + sbbq %r10, %rcx + sbbq $0x0, %r8 + movq $0xffffffff00000001, %rdx + sbbq %rdx, %r9 + sbbq $0x0, %r11 + andq %r11, %r10 + andq %r11, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsp), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsp), %rdx + xorl %r14d, %r14d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsp), %rdx + xorl %r8d, %r8d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsp), %rdx + xorl %r9d, %r9d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x68(%rsp), %r9, %r10 + mulxq 0x78(%rsp), %r11, %r12 + movq 0x70(%rsp), %rdx + mulxq 0x78(%rsp), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x78(%rsp), %rdx + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x68(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x70(%rsp), %rdx + mulxq %rdx, 
%rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x78(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsp), %r9, %r10 + mulxq 0x58(%rsp), %r11, %r12 + movq 0x50(%rsp), %rdx + mulxq 0x58(%rsp), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsp), %rdx + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 
0x58(%rsp) + movq $0xffffffffffffffff, %r8 + xorl %r10d, %r10d + subq 0xa0(%rsp), %r8 + movq $0xffffffff, %r9 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + movq $0xffffffff00000001, %r11 + sbbq 0xb8(%rsp), %r11 + xorl %r12d, %r12d + movq $0x9, %rdx + mulxq %r8, %r8, %rax + mulxq %r9, %r9, %rcx + addq %rax, %r9 + mulxq %r10, %r10, %rax + adcq %rcx, %r10 + mulxq %r11, %r11, %rcx + adcq %rax, %r11 + adcq %rcx, %r12 + movq $0xc, %rdx + xorl %eax, %eax + mulxq 0x80(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x88(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x90(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x98(%rsp), %rax, %rdx + adcxq %rax, %r11 + adoxq %r12, %rdx + adcq $0x1, %rdx + addq %rdx, %r8 + movq $0x100000000, %rax + mulxq %rax, %rax, %rcx + sbbq $0x0, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + movq $0xffffffff00000001, %rax + mulxq %rax, %rax, %rcx + sbbq %rax, %r11 + sbbq %rcx, %rdx + decq %rdx + movl $0xffffffff, %eax + andq %rdx, %rax + xorl %ecx, %ecx + subq %rax, %rcx + addq %rdx, %r8 + movq %r8, 0xa0(%rsp) + adcq %rax, %r9 + movq %r9, 0xa8(%rsp) + adcq $0x0, %r10 + movq %r10, 0xb0(%rsp) + adcq %rcx, %r11 + movq %r11, 0xb8(%rsp) + movq 0x40(%rsp), %rax + subq (%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq 
%r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq 0x60(%rsp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x68(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x70(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x78(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rsp), %rax + subq 0x20(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x28(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x30(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x38(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rdi) + adcq %r10, %rcx + movq %rcx, 0x48(%rdi) + adcq $0x0, %r8 + movq %r8, 0x50(%rdi) + adcq %rdx, %r9 + movq %r9, 0x58(%rdi) + movq 0x98(%rsp), %r11 + movq %r11, %rdx + movq 0x90(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0x88(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0x80(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + shrq $0x3e, %rdx + addq $0x1, %rdx + subq 0xa0(%rsp), %r8 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + sbbq 0xb8(%rsp), %r11 + sbbq $0x0, %rdx + addq %rdx, %r8 + movq $0x100000000, %rax + mulxq %rax, %rax, %rcx + sbbq $0x0, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + 
sbbq %rcx, %r10 + movq $0xffffffff00000001, %rax + mulxq %rax, %rax, %rcx + sbbq %rax, %r11 + sbbq %rcx, %rdx + decq %rdx + movl $0xffffffff, %eax + andq %rdx, %rax + xorl %ecx, %ecx + subq %rax, %rcx + addq %rdx, %r8 + movq %r8, (%rdi) + adcq %rax, %r9 + movq %r9, 0x8(%rdi) + adcq $0x0, %r10 + movq %r10, 0x10(%rdi) + adcq %rcx, %r11 + movq %r11, 0x18(%rdi) + movq $0xffffffffffffffff, %r8 + xorl %r10d, %r10d + subq (%rsp), %r8 + movq $0xffffffff, %r9 + sbbq 0x8(%rsp), %r9 + sbbq 0x10(%rsp), %r10 + movq $0xffffffff00000001, %r11 + sbbq 0x18(%rsp), %r11 + movq %r11, %r12 + shldq $0x3, %r10, %r11 + shldq $0x3, %r9, %r10 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + shrq $0x3d, %r12 + movq $0x3, %rdx + xorl %eax, %eax + mulxq 0x60(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x68(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x70(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x78(%rsp), %rax, %rdx + adcxq %rax, %r11 + adoxq %r12, %rdx + adcq $0x1, %rdx + addq %rdx, %r8 + movq $0x100000000, %rax + mulxq %rax, %rax, %rcx + sbbq $0x0, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + movq $0xffffffff00000001, %rax + mulxq %rax, %rax, %rcx + sbbq %rax, %r11 + sbbq %rcx, %rdx + decq %rdx + movl $0xffffffff, %eax + andq %rdx, %rax + xorl %ecx, %ecx + subq %rax, %rcx + addq %rdx, %r8 + movq %r8, 0x20(%rdi) + adcq %rax, %r9 + movq %r9, 0x28(%rdi) + adcq $0x0, %r10 + movq %r10, 0x30(%rdi) + adcq %rcx, %r11 + movq %r11, 0x38(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_local_p256_montjmixadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq 
-0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq 0x20(%rbp), %rdx + mulxq 0x40(%rsi), %r8, %r9 + mulxq 0x48(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rbp), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rbp), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rbp), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %r13d, %r13d + movq 0x0(%rbp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rbp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq 
%rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rbp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rbp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq 
%rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x40(%rsp), %rax + subq (%rsi), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsi), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsi), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0x20(%rsi), %rax + movq 0x28(%rsp), %rcx + sbbq 0x28(%rsi), %rcx + movq 0x30(%rsp), %r8 + sbbq 0x30(%rsi), %r8 + movq 0x38(%rsp), %r9 + sbbq 0x38(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0xa8(%rsp), %r9, %r10 + mulxq 0xb8(%rsp), %r11, %r12 + movq 0xb0(%rsp), %rdx + mulxq 0xb8(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0xb8(%rsp), %rdx + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0xa8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0xb0(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0xb8(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), 
%rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %ebx + addq %r12, %rbx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rcx), %rcx + movq %rcx, %rax + adcq %r14, %rcx + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rbx, %r12 + cmovbq %rdx, %r13 + cmovbq %rcx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq (%rsi), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq 
%r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + 
movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + 
adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsi), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x80(%rsp), %rdx + mulxq 0x20(%rsp), %r8, %r9 + mulxq 0x28(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x30(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x38(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x88(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x90(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, 
%r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x98(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %rax + movq 0x48(%rsi), %rdx + orq 0x50(%rsi), %rax + orq 0x58(%rsi), %rdx + orq %rdx, %rax + movq (%rsp), %r8 + movq 0x0(%rbp), %rax + cmoveq %rax, %r8 + movq 0x8(%rsp), %r9 + movq 0x8(%rbp), %rax + cmoveq %rax, %r9 + movq 0x10(%rsp), %r10 + movq 0x10(%rbp), %rax + cmoveq %rax, %r10 + movq 0x18(%rsp), %r11 + movq 0x18(%rbp), %rax + cmoveq %rax, %r11 + movq 0x80(%rsp), %r12 + movq 0x20(%rbp), %rax + cmoveq %rax, %r12 + movq 0x88(%rsp), %r13 + movq 0x28(%rbp), %rax + cmoveq %rax, %r13 + movq 0x90(%rsp), %r14 + movq 0x30(%rbp), %rax + cmoveq %rax, %r14 + movq 0x98(%rsp), %r15 + movq 0x38(%rbp), %rax + cmoveq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + movq %r12, 0x20(%rdi) + movq %r13, 0x28(%rdi) + movq %r14, 0x30(%rdi) + movq %r15, 0x38(%rdi) + movq 0xa0(%rsp), %r8 + movq 0xa8(%rsp), %r9 + movq 0xb0(%rsp), %r10 + movq 0xb8(%rsp), %r11 + movl $0x1, %eax + cmoveq %rax, %r8 + movq $0xffffffff00000000, %rax + cmoveq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmoveq %rax, %r10 + movl $0xfffffffe, %eax + cmoveq %rax, %r11 + movq %r8, 0x40(%rdi) + movq %r9, 0x48(%rdi) + movq %r10, 0x50(%rdi) + movq %r11, 0x58(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul_alt.S new file mode 100644 index 00000000000..f1ec19f4491 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmul_alt.S @@ -0,0 +1,8651 @@ +// Copyright Amazon.com, 
Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for P-256 +// Input scalar[4], point[8]; output res[8] +// +// extern void p256_scalarmul_alt +// (uint64_t res[static 8],uint64_t scalar[static 4], +// uint64_t point[static 8]); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, returns the point (X,Y) = n * P. The input and output +// are affine points, and in the case of the point at infinity as +// the result, (0,0) is returned. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. The last z2, z3 values can +// safely be overlaid on the table, which is no longer needed at the end. +// Uppercase syntactic variants make x86_att version simpler to generate + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define Z2 (7*NUMSIZE) +#define z2 (7*NUMSIZE)(%rsp) +#define Z3 (8*NUMSIZE) +#define z3 (8*NUMSIZE)(%rsp) + +#define res (31*NUMSIZE)(%rsp) + +#define NSPACE (32*NUMSIZE) + +S2N_BN_SYMBOL(p256_scalarmul_alt): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq p256_scalarmul_alt_standard + popq %rsi + popq %rdi + ret + +p256_scalarmul_alt_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + movq %rdx, %rbx + movq %rdi, res + +// Load the digits of group order n_256 = [%r15;%r14;%r13;%r12] + + movq $0xf3b9cac2fc632551, %r12 + movq $0xbce6faada7179e84, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xffffffff00000000, %r15 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + +// Now if the top bit of the reduced scalar is set, negate it mod n_256, +// i.e. do n |-> n_256 - n. Remember the sign in %rbp so we can +// correspondingly negate the point below. + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %r11, %rbp + shrq $63, %rbp + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. 
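For orientation, the scalar preparation described in the comments above (reduce the scalar modulo the group order n_256, negate it modulo n_256 when the top bit of the reduced value is set while remembering the sign, then add the recoding constant so each 4-bit window can be read as a signed digit) corresponds to roughly the following C sketch. The names sub4 and p256_recode_scalar and the value-dependent ?: selects are illustrative only, not part of this patch; the assembly that follows, which uses sbb/cmov throughout, is the authoritative constant-time implementation.

#include <stdint.h>

/* Group order n_256, least-significant limb first (the same digits the code
   above loads into %r12..%r15). */
static const uint64_t N256[4] = {
    0xf3b9cac2fc632551ULL, 0xbce6faada7179e84ULL,
    0xffffffffffffffffULL, 0xffffffff00000000ULL};

/* r = a - b over 4 limbs; returns the final borrow (1 iff a < b). */
static uint64_t sub4(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]) {
  uint64_t borrow = 0;
  for (int i = 0; i < 4; i++) {
    uint64_t d = a[i] - b[i];
    uint64_t b1 = a[i] < b[i];
    r[i] = d - borrow;
    borrow = b1 | (d < borrow);
  }
  return borrow;
}

/* Returns the sign flag the assembly keeps in %rbp: 1 if the scalar was
   replaced by n_256 - scalar (so the point's y coordinate is negated too). */
static uint64_t p256_recode_scalar(uint64_t out[4], const uint64_t scalar[4]) {
  uint64_t red[4], neg[4];

  /* Conditionally subtract n_256; keep the original if the subtraction borrows. */
  uint64_t lt = sub4(red, scalar, N256);
  for (int i = 0; i < 4; i++) red[i] = lt ? scalar[i] : red[i];

  /* If the top bit is set, use n_256 - scalar instead and remember the sign. */
  uint64_t sign = red[3] >> 63;
  sub4(neg, N256, red);
  for (int i = 0; i < 4; i++) out[i] = sign ? neg[i] : red[i];

  /* Add 0x8888...8888 across all four limbs with carry, then flip the top
     bit (the btc $63 below), so the main loop can treat every 4-bit window
     minus 8 as a signed digit in -8..7. */
  uint64_t carry = 0;
  for (int i = 0; i < 4; i++) {
    uint64_t s = out[i] + 0x8888888888888888ULL;
    uint64_t c1 = s < out[i];
    out[i] = s + carry;
    carry = c1 | (out[i] < s);
  }
  out[3] ^= 0x8000000000000000ULL;
  return sign;
}

Recoding into signed digits is what lets the precomputed table below hold only the eight multiples 1*P, ..., 8*P.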
+ + movq $0x8888888888888888, %rax + addq %rax, %r8 + adcq %rax, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + btc $63, %r11 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + +// Set the tab[0] table entry to Montgomery-Jacobian point = 1 * P +// The z coordinate is just the Montgomery form of the constant 1. + + leaq TAB(%rsp), %rdi + movq %rbx, %rsi + callq p256_scalarmul_alt_local_tomont_p256 + + leaq 32(%rbx), %rsi + leaq TAB+32(%rsp), %rdi + callq p256_scalarmul_alt_local_tomont_p256 + + movl $1, %eax + movq %rax, TAB+64(%rsp) + movq $0xffffffff00000000, %rdx + movq %rdx, TAB+72(%rsp) + subq $2, %rax + movq %rax, TAB+80(%rsp) + movq $0x00000000fffffffe, %rax + movq %rax, TAB+88(%rsp) + +// If the top bit of the scalar was set, negate (y coordinate of) the point + + movq TAB+32(%rsp), %r12 + movq TAB+40(%rsp), %r13 + movq TAB+48(%rsp), %r14 + movq TAB+56(%rsp), %r15 + + xorl %r10d, %r10d + leaq -1(%r10), %r8 + movq $0x00000000ffffffff, %r11 + movq %r11, %r9 + negq %r11 + + subq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + sbbq %r15, %r11 + + testq %rbp, %rbp + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + + movq %r8, TAB+32(%rsp) + movq %r9, TAB+40(%rsp) + movq %r10, TAB+48(%rsp) + movq %r11, TAB+56(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + leaq TAB+96*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq p256_scalarmul_alt_local_p256_montjdouble + + leaq TAB+96*2(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_alt_local_p256_montjmixadd + + leaq TAB+96*3(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + callq p256_scalarmul_alt_local_p256_montjdouble + + leaq TAB+96*4(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_alt_local_p256_montjmixadd + + leaq TAB+96*5(%rsp), %rdi + leaq TAB+96*2(%rsp), %rsi + callq p256_scalarmul_alt_local_p256_montjdouble + + leaq TAB+96*6(%rsp), %rdi + leaq TAB+96*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p256_scalarmul_alt_local_p256_montjmixadd + + leaq TAB+96*7(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + callq p256_scalarmul_alt_local_p256_montjdouble + +// Set up accumulator as table entry for top 4 bits (constant-time indexing) + + movq SCALARB+24(%rsp), %rdi + shrq $60, %rdi + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + movq %rax, ACC(%rsp) + movq %rbx, ACC+8(%rsp) + movq %rcx, ACC+16(%rsp) + movq %rdx, ACC+24(%rsp) + movq %r8, ACC+32(%rsp) + movq %r9, ACC+40(%rsp) + movq %r10, ACC+48(%rsp) + movq %r11, ACC+56(%rsp) + movq %r12, ACC+64(%rsp) + movq %r13, ACC+72(%rsp) + movq %r14, ACC+80(%rsp) + movq %r15, ACC+88(%rsp) + +// Main loop over size-4 bitfield + + movl $252, %ebp + +p256_scalarmul_alt_loop: + subq $4, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq 
p256_scalarmul_alt_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_alt_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_alt_local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_alt_local_p256_montjdouble + + movq %rbp, %rax + shrq $6, %rax + movq (%rsp,%rax,8), %rdi + movq %rbp, %rcx + shrq %cl, %rdi + andq $15, %rdi + + subq $8, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + + movq %r12, TABENT+64(%rsp) + movq %r13, TABENT+72(%rsp) + movq %r14, TABENT+80(%rsp) + movq %r15, TABENT+88(%rsp) + + xorl %r14d, %r14d + leaq -1(%r14), %r12 + movq $0x00000000ffffffff, %r15 + movq %r15, %r13 + negq %r15 + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + testq %rsi, %rsi + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p256_scalarmul_alt_local_p256_montjadd + + testq %rbp, %rbp + jne p256_scalarmul_alt_loop + +// Let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + callq p256_scalarmul_alt_local_montsqr_p256 + + leaq Z3(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z2(%rsp), %rdx + callq p256_scalarmul_alt_local_montmul_p256 + + leaq Z2(%rsp), %rdi + leaq Z3(%rsp), %rsi + callq p256_scalarmul_alt_local_demont_p256 + + leaq Z3(%rsp), %rdi + leaq Z2(%rsp), %rsi + callq p256_scalarmul_alt_local_inv_p256 + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmul_alt_local_montmul_p256 + +// Convert back from Jacobian (X, Y, Z) |-> (X/Z^2, Y/Z^3) + + movq res, %rdi + leaq ACC(%rsp), %rsi + leaq Z2(%rsp), %rdx + movq %rdi, %rbx + callq p256_scalarmul_alt_local_montmul_p256 + + leaq 32(%rbx), %rdi + leaq ACC+32(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmul_alt_local_montmul_p256 + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmul_alt_local_demont_p256: + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + movabsq $0x100000000, %rcx + movq %r8, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rsi, %rsi + movq %r9, %rax + mulq %rcx + subq %rsi, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + 
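Looking back at the main loop above: each iteration extracts one 4-bit window of the recoded scalar at bit position %rbp, subtracts 8, splits the result into a sign mask and an absolute value, and then scans all eight table entries with cmov so the memory access pattern and the selection are independent of the secret digit. A hedged C sketch of that digit handling and table selection (helper names and the struct layout are illustrative, not part of this patch):

#include <stdint.h>
#include <stddef.h>

/* One Montgomery-Jacobian table entry: x, y, z of (i+1)*P, 4 limbs each. */
typedef struct { uint64_t v[12]; } p256_jac;

/* Signed digit at bit position 'pos' (a multiple of 4) of the recoded
   scalar, as in the main loop: window minus 8, split into |d| and sign. */
static void get_digit(const uint64_t scalar[4], unsigned pos,
                      uint64_t *absval, uint64_t *sign) {
  int64_t d = (int64_t)((scalar[pos >> 6] >> (pos & 63)) & 15) - 8;
  uint64_t s = (d < 0) ? ~UINT64_C(0) : 0;   /* matches the sbb-produced mask */
  *sign = s;
  *absval = ((uint64_t)d ^ s) - s;
}

/* Constant-time-style lookup: touch every entry, keep the one whose index
   matches |d| in 1..8; |d| == 0 leaves the result all-zero. */
static void select_entry(p256_jac *out, const p256_jac tab[8], uint64_t absval) {
  for (size_t j = 0; j < 12; j++) out->v[j] = 0;
  for (uint64_t i = 1; i <= 8; i++) {
    uint64_t mask = 0 - (uint64_t)(i == absval);   /* all-ones when selected */
    for (size_t j = 0; j < 12; j++) out->v[j] |= tab[i - 1].v[j] & mask;
  }
}

When the sign mask is non-zero, only the y coordinate of the selected entry has to be negated mod p_256 before the Jacobian addition, which is what the cmovnz block after the table scan in the loop does.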
sbbq %rsi, %rsi + negq %rcx + negq %rsi + incq %rcx + movq %r8, %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %rsi + sbbq %r8, %r8 + negq %r8 + movq %r9, %rax + mulq %rcx + addq %rax, %rsi + adcq %rdx, %r8 + negq %rcx + incq %rcx + movq %r10, %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %rsi + sbbq %r9, %r9 + movq %r11, %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %rsi + adcq %rdx, %r8 + sbbq %r9, %r9 + negq %rcx + negq %r9 + incq %rcx + movq %r10, %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %r10, %r10 + negq %r10 + movq %r11, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + movq %rsi, (%rdi) + movq %r8, 0x8(%rdi) + movq %r9, 0x10(%rdi) + movq %r10, 0x18(%rdi) + ret + +p256_scalarmul_alt_local_inv_p256: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xf0, %rsp + movq %rdi, 0xe0(%rsp) + xorl %ecx, %ecx + movl $0xffffffff, %edx + movq %rdx, %rbx + leaq -0x1(%rcx), %rax + negq %rdx + movq %rax, (%rsp) + movq %rbx, 0x8(%rsp) + movq %rcx, 0x10(%rsp) + movq %rdx, 0x18(%rsp) + movq %rcx, 0x20(%rsp) + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + leaq 0x1(%rcx), %rax + addq %r8, %rax + leaq -0x1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq %rax, 0x28(%rsp) + movq %rbx, 0x30(%rsp) + movq %rcx, 0x38(%rsp) + movq %rdx, 0x40(%rsp) + xorl %eax, %eax + movq %rax, 0x48(%rsp) + xorl %eax, %eax + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movq %rax, 0x60(%rsp) + movq %rax, 0x68(%rsp) + movabsq $0x4000000000000, %rcx + movq %rcx, 0x78(%rsp) + movq %rax, 0x80(%rsp) + movq %rax, 0x88(%rsp) + movq %rax, 0x90(%rsp) + movq $0xa, 0xb0(%rsp) + movq $0x1, 0xb8(%rsp) + jmp p256_scalarmul_alt_inv_midloop +p256_scalarmul_alt_inv_loop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0xa0(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0xa8(%rsp) + xorl %ebx, %ebx + movq (%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx + movq 0x8(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x28(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + 
xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x38(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x30(%rsp) + movq 0x18(%rsp), %rax + xorq %r9, %rax + movq 0x20(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x40(%rsp), %rax + xorq %r11, %rax + movq 0x48(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + sarq $0x3b, %rbp + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + movq 0x20(%rsp), %rsi + movq %rbp, 0x20(%rsp) + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x40(%rsp), %rax + xorq %r15, %rax + movq 0x48(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x38(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x40(%rsp) + sarq $0x3b, %rsi + movq %rsi, 0x48(%rsp) + movq 0xa0(%rsp), %rbx + movq 0xa8(%rsp), %rbp + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x78(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x78(%rsp) + xorl %ebx, %ebx + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x58(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x58(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x80(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x80(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x60(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x60(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x88(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x88(%rsp) + movq 0x68(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq 0x68(%rsp), %rax + movq %rcx, 0x68(%rsp) + movq %rdx, 0x70(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq 0x90(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, 0x90(%rsp) + movq %rdx, 0x98(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, 
%r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x78(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x80(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x88(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x90(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x98(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x78(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x80(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x88(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x90(%rsp) +p256_scalarmul_alt_inv_midloop: + movq 0xb8(%rsp), %rsi + movq (%rsp), %rdx + movq 0x28(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, 
%rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xc0(%rsp) + movq %rbx, 0xc8(%rsp) + movq %rdi, 0xd0(%rsp) + movq %rcx, 0xd8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x28(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq 
%rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + 
movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xc0(%rsp), %rax + imulq %r8, %rax + movq 0xd0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xc8(%rsp), %r8 + imulq 0xd8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xc0(%rsp), %rax + imulq %r10, %rax + movq 0xd0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xc8(%rsp), %r10 + imulq 0xd8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs 
%rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, 
%rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0xb8(%rsp) + decq 0xb0(%rsp) + jne p256_scalarmul_alt_inv_loop + movq (%rsp), %rax + movq 0x28(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x68(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r12, 0x50(%rsp) + movq %r13, 0x58(%rsp) + movq %r14, 0x60(%rsp) + movq %r15, 0x68(%rsp) + movq %r9, 0x70(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq 0x50(%rsp), %r8 + movq 0x58(%rsp), %r9 + movq 0x60(%rsp), %r10 + movq 0x68(%rsp), %r11 + movl $0x1, %eax + movl $0xffffffff, %ebx + leaq -0x2(%rax), %rcx + leaq -0x1(%rbx), %rdx + notq %rbx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq 0xe0(%rsp), %rdi + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + addq $0xf0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_alt_local_montmul_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, %rcx + movq (%rcx), %rbx + movq (%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + xorl 
%r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rcx), %rbx + xorl %r13d, %r13d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rcx), %rbx + xorl %r15d, %r15d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rcx), %rbx + xorl %r8d, %r8d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmul_alt_local_montsqr_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x8(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x18(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x10(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x8(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x18(%rsi), %rbx 
+ movq 0x8(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x8(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x10(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x18(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + + +p256_scalarmul_alt_local_tomont_p256: + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movl $0x3, %ecx + movq (%rsi), %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsi), %rax + mulq %rcx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsi), %rax + mulq %rcx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsi), %rax + mulq %rcx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movabsq $0xfffffffbffffffff, %rcx + xorl %r13d, %r13d + movq (%rsi), %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsi), %rax + mulq %rcx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsi), %rax + mulq %rcx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsi), %rax + mulq %rcx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rcx + movq %r8, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rcx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rcx + leaq 0x2(%rcx), %rcx + movq %r8, %rax + mulq %rcx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rcx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq $0xfffffffffffffffe, %rcx + xorl %r15d, %r15d + movq (%rsi), %rax + mulq %rcx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsi), %rax + mulq %rcx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + 
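One detail of p256_scalarmul_alt_local_tomont_p256 worth noting: the four per-pass multipliers it uses (0x3, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x4fffffffd) form, read least-significant limb first, what appears to be R^2 = 2^512 mod p_256, so the multiply-and-reduce sequence maps a value a to a*R mod p_256, i.e. into Montgomery form. Written out as a C constant purely for reference (this array is not part of the patch):

#include <stdint.h>

/* Apparent value of R^2 = 2^512 mod p_256, least-significant limb first;
   these are the same four multipliers the tomont routine above applies
   pass by pass before its Montgomery reduction steps. */
static const uint64_t P256_RR[4] = {
    0x0000000000000003ULL, 0xfffffffbffffffffULL,
    0xfffffffffffffffeULL, 0x00000004fffffffdULL};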
sbbq %r8, %r8 + movq 0x10(%rsi), %rax + mulq %rcx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsi), %rax + mulq %rcx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movabsq $0x4fffffffd, %rcx + xorl %r8d, %r8d + movq (%rsi), %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsi), %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsi), %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsi), %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rcx + movq %r10, %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq %r11, %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + notq %rcx + leaq 0x2(%rcx), %rcx + movq %r10, %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq %r11, %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + sbbq %r9, %r9 + subq %r9, %r8 + xorl %edx, %edx + leaq -0x1(%rdx), %r9 + incq %rdx + addq %r12, %rdx + decq %rcx + adcq %r13, %rcx + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rdx, %r12 + cmovbq %rcx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + ret + +p256_scalarmul_alt_local_p256_montjadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xe0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq 
%rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x40(%rbp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rbp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rbp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rbp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rbp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rbp), %rbx + movq 0x48(%rbp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rbp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rbp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rbp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x20(%rsi), %rbx + movq 0x40(%rbp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rbp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rbp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rbp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsi), %rbx + xorl %r13d, %r13d + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, 
%r10 + sbbq %r14, %r14 + movq 0x48(%rbp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rbp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rbp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsi), %rbx + xorl %r15d, %r15d + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rbp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rbp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rbp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsi), %rbx + xorl %r8d, %r8d + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rbp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rbp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rbp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x20(%rbp), %rbx + movq 0x40(%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rbp), %rbx + xorl %r13d, %r13d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + 
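+ # The notq/leaq pair that follows rewrites the 0x100000000 (2^32) constant in %rbx
+ # into 0xffffffff00000001, the top 64-bit word of p_256, for the second half of this
+ # reduction step, avoiding a second movabsq.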
notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rbp), %rbx + xorl %r15d, %r15d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rbp), %rbx + xorl %r8d, %r8d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x0(%rbp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rbp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rbp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 
0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rbp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsi), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rsi), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq 
$0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x20(%rsp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq 
%r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0xc0(%rsp), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0xc8(%rsp), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xd0(%rsp), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xd8(%rsp), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 
0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0xc0(%rsp), %rax + movq 0x28(%rsp), %rcx + sbbq 0xc8(%rsp), %rcx + movq 0x30(%rsp), %r8 + sbbq 0xd0(%rsp), %r8 + movq 0x38(%rsp), %r9 + sbbq 0xd8(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0xa8(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0xb8(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0xb0(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0xb8(%rsp), %rbx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 
0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x80(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x88(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x90(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax 
+ mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x98(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x40(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq 
$0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq 0x40(%rsi), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsi), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsi), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsi), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), 
%rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0xc0(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0xc8(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xd0(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq 
%r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xd8(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rbp), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rbp), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rbp), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rbp), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + 
addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x80(%rsp), %rbx + movq 0x20(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x30(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x38(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x88(%rsp), %rbx + xorl %r13d, %r13d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x90(%rsp), %rbx + xorl %r15d, %r15d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x98(%rsp), %rbx + xorl %r8d, %r8d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq 
%rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %r8 + movq 0x48(%rsi), %r9 + movq 0x50(%rsi), %r10 + movq 0x58(%rsi), %r11 + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + movq 0x40(%rbp), %r12 + movq 0x48(%rbp), %r13 + movq 0x50(%rbp), %r14 + movq 0x58(%rbp), %r15 + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + cmpq %rax, %rbx + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + cmoveq 0xa0(%rsp), %r12 + cmoveq 0xa8(%rsp), %r13 + cmoveq 0xb0(%rsp), %r14 + cmoveq 0xb8(%rsp), %r15 + movq (%rsp), %rax + cmovbq (%rsi), %rax + cmova 0x0(%rbp), %rax + movq 0x8(%rsp), %rbx + cmovbq 0x8(%rsi), %rbx + cmova 0x8(%rbp), %rbx + movq 0x10(%rsp), %rcx + cmovbq 0x10(%rsi), %rcx + cmova 0x10(%rbp), %rcx + movq 0x18(%rsp), %rdx + cmovbq 0x18(%rsi), %rdx + cmova 0x18(%rbp), %rdx + movq 0x80(%rsp), %r8 + cmovbq 0x20(%rsi), %r8 + cmova 0x20(%rbp), %r8 + movq 0x88(%rsp), %r9 + cmovbq 0x28(%rsi), %r9 + cmova 0x28(%rbp), %r9 + movq 0x90(%rsp), %r10 + cmovbq 0x30(%rsi), %r10 + cmova 0x30(%rbp), %r10 + movq 0x98(%rsp), %r11 + cmovbq 0x38(%rsi), %r11 + cmova 0x38(%rbp), %r11 + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + movq %r8, 0x20(%rdi) + movq %r9, 0x28(%rdi) + movq %r10, 0x30(%rdi) + movq %r11, 0x38(%rdi) + movq %r12, 0x40(%rdi) + movq %r13, 0x48(%rdi) + movq %r14, 0x50(%rdi) + movq %r15, 0x58(%rdi) + addq $0xe0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmul_alt_local_p256_montjdouble: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + 
adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsi), %rbx + movq 0x28(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq 
-0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + subq (%rsp), %rax + movq 0x8(%rsi), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq (%rsi), %rax + addq (%rsp), %rax + movq 0x8(%rsi), %rcx + adcq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + adcq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + subq %r11, %rax + movq %rax, 0x40(%rsp) + sbbq %r10, %rcx + movq %rcx, 0x48(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x50(%rsp) + sbbq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x60(%rsp), %rbx + movq 0x40(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x68(%rsp), %rbx + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rbx + xorl %r15d, %r15d + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rbx + xorl %r8d, %r8d + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + 
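+ # The sbbq %rcx, %rcx below records the pending carry as a 0/-1 mask in %rcx; the
+ # later subq %rcx, %rdx folds it back into the high word of the next partial product.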
sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorq %r11, %r11 + movq 0x20(%rsi), %rax + addq 0x40(%rsi), %rax + movq 0x28(%rsi), %rcx + adcq 0x48(%rsi), %rcx + movq 0x30(%rsi), %r8 + adcq 0x50(%rsi), %r8 + movq 0x38(%rsi), %r9 + adcq 0x58(%rsi), %r9 + adcq %r11, %r11 + subq $0xffffffffffffffff, %rax + movl $0xffffffff, %r10d + sbbq %r10, %rcx + sbbq $0x0, %r8 + movabsq $0xffffffff00000001, %rdx + sbbq %rdx, %r9 + sbbq $0x0, %r11 + andq %r11, %r10 + andq %r11, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rbx + movq (%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsp), %rbx + xorl %r13d, %r13d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rbx + xorl %r15d, %r15d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rbx + xorl %r8d, %r8d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 
+ sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x68(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x78(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x70(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x68(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x78(%rsp), %rbx + movq 0x68(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x68(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x70(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x78(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq 
%rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsp), %rbx + movq 0x48(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq $0xffffffffffffffff, %r9 + xorl %r11d, %r11d + subq 0xa0(%rsp), %r9 + movabsq $0xffffffff, %r10 + sbbq 0xa8(%rsp), %r10 + sbbq 0xb0(%rsp), %r11 + movabsq $0xffffffff00000001, %r12 + sbbq 0xb8(%rsp), %r12 + movq $0x9, %rcx + movq %r9, %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + movq %r10, %rax + xorl %r10d, %r10d + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + movq %r11, %rax + xorl %r11d, %r11d + mulq %rcx + addq %rax, %r10 + adcq %rdx, %r11 + movq %r12, %rax + xorl %r12d, %r12d + mulq %rcx + addq %rax, %r11 + adcq %rdx, %r12 + movl $0xc, %ecx + movq 0x80(%rsp), %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbx, %rbx + movq 0x88(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbx, %rbx + movq 0x90(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbx, %rbx + movq 0x98(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + leaq 0x1(%r12), %rcx + movabsq $0xffffffff00000001, %rax + mulq %rcx + movq %rcx, %rbx + shlq $0x20, %rbx + addq %rcx, %r8 + sbbq $0x0, %rbx + subq %rbx, %r9 + sbbq $0x0, %r10 + sbbq %rax, %r11 + sbbq %rdx, %rcx + decq %rcx + movl $0xffffffff, %eax + andq %rcx, %rax + xorl %edx, %edx + subq %rax, %rdx + addq %rcx, %r8 + movq %r8, 0xa0(%rsp) + adcq %rax, %r9 + movq %r9, 0xa8(%rsp) + adcq $0x0, %r10 + movq %r10, 0xb0(%rsp) + adcq %rdx, %r11 + movq %r11, 0xb8(%rsp) + movq 0x40(%rsp), %rax + subq (%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 
0x58(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x60(%rsp), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x68(%rsp), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq 
%rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rsp), %rax + subq 0x20(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x28(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x30(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x38(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rdi) + adcq %r10, %rcx + movq %rcx, 0x48(%rdi) + adcq $0x0, %r8 + movq %r8, 0x50(%rdi) + adcq %rdx, %r9 + movq %r9, 0x58(%rdi) + movq 0x98(%rsp), %r11 + movq %r11, %rcx + movq 0x90(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0x88(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0x80(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + shrq $0x3e, %rcx + addq $0x1, %rcx + subq 0xa0(%rsp), %r8 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + sbbq 0xb8(%rsp), %r11 + sbbq $0x0, %rcx + movabsq $0xffffffff00000001, %rax + mulq %rcx + movq %rcx, %rbx + shlq $0x20, %rbx + addq %rcx, %r8 + sbbq $0x0, %rbx + subq %rbx, %r9 + sbbq $0x0, %r10 + sbbq %rax, %r11 + sbbq %rdx, %rcx + decq %rcx + movl $0xffffffff, %eax + andq %rcx, %rax + xorl %edx, %edx + subq %rax, %rdx + addq %rcx, %r8 + movq %r8, (%rdi) + adcq %rax, %r9 + movq %r9, 0x8(%rdi) + adcq $0x0, %r10 + movq %r10, 0x10(%rdi) + adcq %rdx, %r11 + movq %r11, 0x18(%rdi) + movq $0xffffffffffffffff, %r8 + xorl %r10d, %r10d + subq (%rsp), %r8 + movabsq $0xffffffff, %r9 + sbbq 0x8(%rsp), %r9 + 
sbbq 0x10(%rsp), %r10 + movabsq $0xffffffff00000001, %r11 + sbbq 0x18(%rsp), %r11 + movq %r11, %r12 + shldq $0x3, %r10, %r11 + shldq $0x3, %r9, %r10 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + shrq $0x3d, %r12 + movl $0x3, %ecx + movq 0x60(%rsp), %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbx, %rbx + movq 0x68(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbx, %rbx + movq 0x70(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbx, %rbx + movq 0x78(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + leaq 0x1(%r12), %rcx + movabsq $0xffffffff00000001, %rax + mulq %rcx + movq %rcx, %rbx + shlq $0x20, %rbx + addq %rcx, %r8 + sbbq $0x0, %rbx + subq %rbx, %r9 + sbbq $0x0, %r10 + sbbq %rax, %r11 + sbbq %rdx, %rcx + decq %rcx + movl $0xffffffff, %eax + andq %rcx, %rax + xorl %edx, %edx + subq %rax, %rdx + addq %rcx, %r8 + movq %r8, 0x20(%rdi) + adcq %rax, %r9 + movq %r9, 0x28(%rdi) + adcq $0x0, %r10 + movq %r10, 0x30(%rdi) + adcq %rdx, %r11 + movq %r11, 0x38(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmul_alt_local_p256_montjmixadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq 
%r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rbp), %rbx + movq 0x40(%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rbp), %rbx + xorl %r13d, %r13d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rbp), %rbx + xorl %r15d, %r15d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rbp), %rbx + xorl %r8d, %r8d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x0(%rbp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, 
%r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rbp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rbp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rbp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq 0x20(%rsp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, 
%r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x40(%rsp), %rax + subq (%rsi), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsi), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsi), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0x20(%rsi), %rax + movq 0x28(%rsp), %rcx + sbbq 0x28(%rsi), %rcx + movq 0x30(%rsp), %r8 + sbbq 0x30(%rsi), %r8 + movq 0x38(%rsp), %r9 + sbbq 0x38(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0xa8(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0xb8(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0xb0(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0xb8(%rsp), %rbx + movq 0xa8(%rsp), %rax + mulq %rbx 
+ subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq 
%rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq (%rsi), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rsi), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, 
%r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x40(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq 
$0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq 0x40(%rsi), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsi), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsi), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsi), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax 
+ movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x20(%rsi), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsi), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsi), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsi), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq 
%r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x80(%rsp), %rbx + movq 0x20(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x30(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x38(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x88(%rsp), %rbx + xorl %r13d, %r13d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x90(%rsp), %rbx + xorl %r15d, %r15d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x98(%rsp), %rbx + xorl %r8d, %r8d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq 
%r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %rax + movq 0x48(%rsi), %rdx + orq 0x50(%rsi), %rax + orq 0x58(%rsi), %rdx + orq %rdx, %rax + movq (%rsp), %r8 + movq 0x0(%rbp), %rax + cmoveq %rax, %r8 + movq 0x8(%rsp), %r9 + movq 0x8(%rbp), %rax + cmoveq %rax, %r9 + movq 0x10(%rsp), %r10 + movq 0x10(%rbp), %rax + cmoveq %rax, %r10 + movq 0x18(%rsp), %r11 + movq 0x18(%rbp), %rax + cmoveq %rax, %r11 + movq 0x80(%rsp), %r12 + movq 0x20(%rbp), %rax + cmoveq %rax, %r12 + movq 0x88(%rsp), %r13 + movq 0x28(%rbp), %rax + cmoveq %rax, %r13 + movq 0x90(%rsp), %r14 + movq 0x30(%rbp), %rax + cmoveq %rax, %r14 + movq 0x98(%rsp), %r15 + movq 0x38(%rbp), %rax + cmoveq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + movq %r12, 0x20(%rdi) + movq %r13, 0x28(%rdi) + movq %r14, 0x30(%rdi) + movq %r15, 0x38(%rdi) + movq 0xa0(%rsp), %r8 + movq 0xa8(%rsp), %r9 + movq 0xb0(%rsp), %r10 + movq 0xb8(%rsp), %r11 + movl $0x1, %eax + cmoveq %rax, %r8 + movabsq $0xffffffff00000000, %rax + cmoveq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmoveq %rax, %r10 + movl $0xfffffffe, %eax + cmoveq %rax, %r11 + movq %r8, 0x40(%rdi) + movq %r9, 0x48(%rdi) + movq %r10, 0x50(%rdi) + movq %r11, 0x58(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase.S new file mode 100644 index 00000000000..442dc1331f0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase.S @@ -0,0 +1,3532 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for precomputed point on NIST curve P-256 +// Input scalar[4], blocksize, table[]; output res[8] +// +// extern void p256_scalarmulbase +// (uint64_t res[static 8], +// uint64_t scalar[static 4], +// uint64_t blocksize, +// uint64_t *table); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, the input argument "table" is expected to be a table of +// multiples of the point P in Montgomery-affine form, with each block +// corresponding to "blocksize" bits of the scalar as follows, where +// B = 2^{blocksize-1} (e.g. B = 8 for blocksize = 4): +// +// For each i,j with blocksize * i <= 256 and 1 <= j <= B +// the multiple 2^{blocksize * i} * j * P is stored at +// tab[8 * (B * i + (j - 1))], considered as uint64_t pointers +// or tab + 64 * (B * i + (j - 1)) as byte pointers. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = blocksize, RCX = table +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = blocksize, R9 = table +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmulbase) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmulbase) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. 
The last z2, z3 values can +// safely be overlaid on "nacc", which is no longer needed at the end. +// Uppercase syntactic variants make x86_att version simpler to generate + +#define RSCALAR (0*NUMSIZE) +#define ACC (1*NUMSIZE) +#define NACC (4*NUMSIZE) +#define TABENT (7*NUMSIZE) +#define Z2 (4*NUMSIZE) +#define Z3 (5*NUMSIZE) + +#define rscalar RSCALAR(%rsp) +#define acc ACC(%rsp) +#define nacc NACC(%rsp) +#define tabent TABENT(%rsp) + +#define z2 Z2(%rsp) +#define z3 Z3(%rsp) + +#define res (9*NUMSIZE)(%rsp) +#define blocksize (9*NUMSIZE+8)(%rsp) +#define table (9*NUMSIZE+16)(%rsp) +#define i (9*NUMSIZE+24)(%rsp) +#define bf (9*NUMSIZE+32)(%rsp) +#define cf (9*NUMSIZE+40)(%rsp) +#define j (9*NUMSIZE+48)(%rsp) + +#define NSPACE (11*NUMSIZE) + +S2N_BN_SYMBOL(p256_scalarmulbase): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + callq p256_scalarmulbase_standard + popq %rsi + popq %rdi + ret + +p256_scalarmulbase_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the input arguments except the scalar, since that gets absorbed +// immediately. The "table" value subsequently gets shifted up each iteration +// of the loop, while "res" and "blocksize" are static throughout. + + movq %rdi, res + movq %rdx, blocksize + movq %rcx, table + +// Load the digits of group order n_256 = [%r15;%r14;%r13;%r12] + + movq $0xf3b9cac2fc632551, %r12 + movq $0xbce6faada7179e84, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xffffffff00000000, %r15 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 +// Store it to "rscalar" (reduced scalar) + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + + movq %r8, RSCALAR(%rsp) + movq %r9, RSCALAR+8(%rsp) + movq %r10, RSCALAR+16(%rsp) + movq %r11, RSCALAR+24(%rsp) + +// Initialize the accumulator to all zeros and the "carry flag" cf to 0 + + xorl %eax, %eax + + movq %rax, ACC(%rsp) + movq %rax, ACC+8(%rsp) + movq %rax, ACC+16(%rsp) + movq %rax, ACC+24(%rsp) + movq %rax, ACC+32(%rsp) + movq %rax, ACC+40(%rsp) + movq %rax, ACC+48(%rsp) + movq %rax, ACC+56(%rsp) + movq %rax, ACC+64(%rsp) + movq %rax, ACC+72(%rsp) + movq %rax, ACC+80(%rsp) + movq %rax, ACC+88(%rsp) + + movq %rax, cf + +// Main loop over {i >= 0 | blocksize * i <= 256}. Note the non-strict +// inequality, to allow top carry for any choices of blocksize. + + movq %rax, i + +p256_scalarmulbase_loop: + +// The next raw bitfield is bf = bitfield(blocksize * i,blocksize) + cf, +// adding in the deferred carry cf. We then shift the whole scalar right +// by blocksize so we can keep picking bitfield(0,blocksize). 
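+// A rough C-style sketch of this step, added for exposition only (the name
+// "mask" and the scalar[4] array view of rscalar are illustrative, not part
+// of the original source):
+//
+//   mask = ((uint64_t)1 << blocksize) - 1;  // movl $1,%eax; shlq %cl,%rax; decq %rax
+//   bf   = (scalar[0] & mask) + cf;         // andq %r8,%rax; addq cf,%rax
+//   scalar >>= blocksize;                   // 256-bit shift via the shrdq/shrq chain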
+ + movq RSCALAR(%rsp), %r8 + movq RSCALAR+8(%rsp), %r9 + movq RSCALAR+16(%rsp), %r10 + movq RSCALAR+24(%rsp), %r11 + + movq blocksize, %rcx + movl $1, %eax + shlq %cl, %rax + decq %rax + andq %r8, %rax + + shrdq %cl, %r9, %r8 + shrdq %cl, %r10, %r9 + shrdq %cl, %r11, %r10 + shrq %cl, %r11 + + addq cf, %rax + movq %rax, bf + + movq %r8, RSCALAR(%rsp) + movq %r9, RSCALAR+8(%rsp) + movq %r10, RSCALAR+16(%rsp) + movq %r11, RSCALAR+24(%rsp) + +// Now if bf <= B we just select entry j, unnegated and set cf = 0. +// If bf > B we set j = 2 * B - bf and negate the j'th entry, setting cf = 1. +// In either case we ultimately add bf, in the latter case with deferred +// carry as 2 * B - (2 * B - bf) = bf. + + movl $1, %eax + movq blocksize, %rcx + shlq %cl, %rax + movq %rax, %rbx + shrq $1, %rax + + subq bf, %rbx + cmpq bf, %rax + + cmovncq bf, %rbx + sbbq %rax, %rax + movq %rbx, j + negq %rax + movq %rax, cf + +// Load table entry j - 1 for nonzero j in constant-time style. + + movq blocksize, %rcx + decq %rcx + movl $1, %esi + shlq %cl, %rsi + movq j, %r12 + movq table, %rbp + +p256_scalarmulbase_tabloop: + subq $1, %r12 + cmovzq (%rbp), %rax + cmovzq 8(%rbp), %rbx + cmovzq 16(%rbp), %rcx + cmovzq 24(%rbp), %rdx + cmovzq 32(%rbp), %r8 + cmovzq 40(%rbp), %r9 + cmovzq 48(%rbp), %r10 + cmovzq 56(%rbp), %r11 + + addq $64, %rbp + decq %rsi + jnz p256_scalarmulbase_tabloop + + movq %rbp, table + +// Before storing back, optionally negate the y coordinate of the table entry + + xorl %r14d, %r14d + leaq -1(%r14), %r12 + movq $0x00000000ffffffff, %r15 + movq %r15, %r13 + negq %r15 + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq cf, %rax + testq %rax, %rax + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + +// Add the adjusted table point to the accumulator + + leaq NACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + leaq TABENT(%rsp), %rdx + callq p256_scalarmulbase_local_p256_montjmixadd + +// However, only commit that update to the accumulator if j is nonzero, +// because the mixed addition function does not handle this case directly, +// and in any case we didn't choose the table entry appropriately. 
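+// Illustrative C-style sketch of the constant-time select that follows; the
+// acc/nacc arrays are a hypothetical uint64_t[12] view of ACC/NACC (the three
+// field elements X, Y, Z), not names from the original source:
+//
+//   for (k = 0; k < 12; k++)
+//     acc[k] = (j != 0) ? nacc[k] : acc[k];  // testq + cmovnzq, branch-free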
+ + movq j, %rax + testq %rax, %rax + + movq ACC(%rsp), %rax + cmovnzq NACC(%rsp), %rax + movq %rax, ACC(%rsp) + + movq ACC+8(%rsp), %rax + cmovnzq NACC+8(%rsp), %rax + movq %rax, ACC+8(%rsp) + + movq ACC+16(%rsp), %rax + cmovnzq NACC+16(%rsp), %rax + movq %rax, ACC+16(%rsp) + + movq ACC+24(%rsp), %rax + cmovnzq NACC+24(%rsp), %rax + movq %rax, ACC+24(%rsp) + + movq ACC+32(%rsp), %rax + cmovnzq NACC+32(%rsp), %rax + movq %rax, ACC+32(%rsp) + + movq ACC+40(%rsp), %rax + cmovnzq NACC+40(%rsp), %rax + movq %rax, ACC+40(%rsp) + + movq ACC+48(%rsp), %rax + cmovnzq NACC+48(%rsp), %rax + movq %rax, ACC+48(%rsp) + + movq ACC+56(%rsp), %rax + cmovnzq NACC+56(%rsp), %rax + movq %rax, ACC+56(%rsp) + + movq ACC+64(%rsp), %rax + cmovnzq NACC+64(%rsp), %rax + movq %rax, ACC+64(%rsp) + + movq ACC+72(%rsp), %rax + cmovnzq NACC+72(%rsp), %rax + movq %rax, ACC+72(%rsp) + + movq ACC+80(%rsp), %rax + cmovnzq NACC+80(%rsp), %rax + movq %rax, ACC+80(%rsp) + + movq ACC+88(%rsp), %rax + cmovnzq NACC+88(%rsp), %rax + movq %rax, ACC+88(%rsp) + +// Loop while blocksize * i <= 256 + + movq i, %rax + incq %rax + movq %rax, i + + imulq blocksize, %rax + cmpq $257, %rax + jc p256_scalarmulbase_loop + +// That's the end of the main loop, and we just need to translate +// back from the Jacobian representation to affine. First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + callq p256_scalarmulbase_local_montsqr_p256 + + leaq Z3(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z2(%rsp), %rdx + callq p256_scalarmulbase_local_montmul_p256 + + leaq Z2(%rsp), %rdi + leaq Z3(%rsp), %rsi + callq p256_scalarmulbase_local_demont_p256 + + leaq Z3(%rsp), %rdi + leaq Z2(%rsp), %rsi + callq p256_scalarmulbase_local_inv_p256 + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmulbase_local_montmul_p256 + +// Convert back from Jacobian (X, Y, Z) |-> (X/Z^2, Y/Z^3) + + movq res, %rdi + leaq ACC(%rsp), %rsi + leaq Z2(%rsp), %rdx + movq %rdi, %rbx + callq p256_scalarmulbase_local_montmul_p256 + + leaq 32(%rbx), %rdi + leaq ACC+32(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmulbase_local_montmul_p256 + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmulbase_local_demont_p256: + pushq %rbx + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + xorq %rbx, %rbx + xorq %rsi, %rsi + movq $0x100000000, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq %r9, %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %rbx + mulxq %r9, %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %rsi + movl $0x0, %r8d + adcxq %r8, %rsi + xorq %r9, %r9 + movq $0x100000000, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %rbx + mulxq %r11, %rax, %rcx + adcxq %rax, %rbx + adoxq %rcx, %rsi + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rcx + adcxq %rax, %rsi + adoxq %rcx, %r8 + mulxq %r11, %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + movl $0x0, %r10d + adcxq %r10, %r9 + movq %rbx, (%rdi) + movq %rsi, 0x8(%rdi) + movq %r8, 0x10(%rdi) + movq %r9, 0x18(%rdi) + popq %rbx + ret + +p256_scalarmulbase_local_inv_p256: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xf0, %rsp + movq %rdi, 
0xe0(%rsp) + xorl %ecx, %ecx + movl $0xffffffff, %edx + movq %rdx, %rbx + leaq -0x1(%rcx), %rax + negq %rdx + movq %rax, (%rsp) + movq %rbx, 0x8(%rsp) + movq %rcx, 0x10(%rsp) + movq %rdx, 0x18(%rsp) + movq %rcx, 0x20(%rsp) + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + leaq 0x1(%rcx), %rax + addq %r8, %rax + leaq -0x1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq %rax, 0x28(%rsp) + movq %rbx, 0x30(%rsp) + movq %rcx, 0x38(%rsp) + movq %rdx, 0x40(%rsp) + xorl %eax, %eax + movq %rax, 0x48(%rsp) + xorl %eax, %eax + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movq %rax, 0x60(%rsp) + movq %rax, 0x68(%rsp) + movq $0x4000000000000, %rcx + movq %rcx, 0x78(%rsp) + movq %rax, 0x80(%rsp) + movq %rax, 0x88(%rsp) + movq %rax, 0x90(%rsp) + movq $0xa, 0xb0(%rsp) + movq $0x1, 0xb8(%rsp) + jmp p256_scalarmulbase_inv_midloop +p256_scalarmulbase_inv_loop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0xa0(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0xa8(%rsp) + xorl %ebx, %ebx + movq (%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx + movq 0x8(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x28(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x38(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x30(%rsp) + movq 0x18(%rsp), %rax + xorq %r9, %rax + movq 0x20(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x40(%rsp), %rax + xorq %r11, %rax + movq 0x48(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + sarq $0x3b, %rbp + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + movq 0x20(%rsp), %rsi + movq %rbp, 0x20(%rsp) + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 
0x40(%rsp), %rax + xorq %r15, %rax + movq 0x48(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x38(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x40(%rsp) + sarq $0x3b, %rsi + movq %rsi, 0x48(%rsp) + movq 0xa0(%rsp), %rbx + movq 0xa8(%rsp), %rbp + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x78(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x78(%rsp) + xorl %ebx, %ebx + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x58(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x58(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x80(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x80(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x60(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x60(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x88(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x88(%rsp) + movq 0x68(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq 0x68(%rsp), %rax + movq %rcx, 0x68(%rsp) + movq %rdx, 0x70(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq 0x90(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, 0x90(%rsp) + movq %rdx, 0x98(%rsp) + movq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq $0xe000000000000000, %r8 + addq 0x78(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x80(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x88(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x90(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x98(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx 
+ movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x78(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x80(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x88(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x90(%rsp) +p256_scalarmulbase_inv_midloop: + movq 0xb8(%rsp), %rsi + movq (%rsp), %rdx + movq 0x28(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi 
+ sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xc0(%rsp) + movq %rbx, 0xc8(%rsp) + movq %rdi, 0xd0(%rsp) + movq %rcx, 0xd8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x28(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, 
%rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + 
xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xc0(%rsp), %rax + imulq %r8, %rax + movq 0xd0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xc8(%rsp), %r8 + imulq 0xd8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xc0(%rsp), %rax + imulq %r10, %rax + movq 0xd0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xc8(%rsp), %r10 + imulq 0xd8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq 
%rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0xb8(%rsp) + decq 0xb0(%rsp) + jne p256_scalarmulbase_inv_loop + movq (%rsp), %rax + movq 0x28(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, 
%r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x68(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r12, 0x50(%rsp) + movq %r13, 0x58(%rsp) + movq %r14, 0x60(%rsp) + movq %r15, 0x68(%rsp) + movq %r9, 0x70(%rsp) + movq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq 0x50(%rsp), %r8 + movq 0x58(%rsp), %r9 + movq 0x60(%rsp), %r10 + movq 0x68(%rsp), %r11 + movl $0x1, %eax + movl $0xffffffff, %ebx + leaq -0x2(%rax), %rcx + leaq -0x1(%rbx), %rdx + notq %rbx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq 0xe0(%rsp), %rdi + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + addq $0xf0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmulbase_local_montmul_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, %rcx + xorl %r13d, %r13d + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rcx), %rdx + xorl %r14d, %r14d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rcx), %rdx + xorl %r8d, %r8d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + 
mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rcx), %rdx + xorl %r9d, %r9d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmulbase_local_montsqr_p256: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x8(%rsi), %r9, %r10 + mulxq 0x18(%rsi), %r11, %r12 + movq 0x10(%rsi), %rdx + mulxq 0x18(%rsi), %r13, %r14 + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x18(%rsi), %rdx + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x8(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x10(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x18(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + 
movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmulbase_local_p256_montjmixadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq 0x20(%rbp), %rdx + mulxq 0x40(%rsi), %r8, %r9 + mulxq 0x48(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rbp), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rbp), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r12 + 
adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rbp), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %r13d, %r13d + movq 0x0(%rbp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rbp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rbp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rbp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 
0x58(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x40(%rsp), %rax + subq (%rsi), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsi), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsi), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0x20(%rsi), %rax + movq 0x28(%rsp), %rcx + sbbq 0x28(%rsi), %rcx + movq 0x30(%rsp), %r8 + sbbq 0x30(%rsi), %r8 + movq 0x38(%rsp), %r9 + sbbq 0x38(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0xa8(%rsp), %r9, %r10 + mulxq 0xb8(%rsp), %r11, %r12 + movq 0xb0(%rsp), %rdx + mulxq 0xb8(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, 
%rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0xb8(%rsp), %rdx + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0xa8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0xb0(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0xb8(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 
+ movl $0x1, %ebx + addq %r12, %rbx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rcx), %rcx + movq %rcx, %rax + adcq %r14, %rcx + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rbx, %r12 + cmovbq %rdx, %r13 + cmovbq %rcx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq (%rsi), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx 
+ adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 
0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsi), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsi), %rdx + xorl %r9d, %r9d + mulxq 
0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x80(%rsp), %rdx + mulxq 0x20(%rsp), %r8, %r9 + mulxq 0x28(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x30(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x38(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x88(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x90(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x98(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), 
%r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %rax + movq 0x48(%rsi), %rdx + orq 0x50(%rsi), %rax + orq 0x58(%rsi), %rdx + orq %rdx, %rax + movq (%rsp), %r8 + movq 0x0(%rbp), %rax + cmoveq %rax, %r8 + movq 0x8(%rsp), %r9 + movq 0x8(%rbp), %rax + cmoveq %rax, %r9 + movq 0x10(%rsp), %r10 + movq 0x10(%rbp), %rax + cmoveq %rax, %r10 + movq 0x18(%rsp), %r11 + movq 0x18(%rbp), %rax + cmoveq %rax, %r11 + movq 0x80(%rsp), %r12 + movq 0x20(%rbp), %rax + cmoveq %rax, %r12 + movq 0x88(%rsp), %r13 + movq 0x28(%rbp), %rax + cmoveq %rax, %r13 + movq 0x90(%rsp), %r14 + movq 0x30(%rbp), %rax + cmoveq %rax, %r14 + movq 0x98(%rsp), %r15 + movq 0x38(%rbp), %rax + cmoveq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + movq %r12, 0x20(%rdi) + movq %r13, 0x28(%rdi) + movq %r14, 0x30(%rdi) + movq %r15, 0x38(%rdi) + movq 0xa0(%rsp), %r8 + movq 0xa8(%rsp), %r9 + movq 0xb0(%rsp), %r10 + movq 0xb8(%rsp), %r11 + movl $0x1, %eax + cmoveq %rax, %r8 + movq $0xffffffff00000000, %rax + cmoveq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmoveq %rax, %r10 + movl $0xfffffffe, %eax + cmoveq %rax, %r11 + movq %r8, 0x40(%rdi) + movq %r9, 0x48(%rdi) + movq %r10, 0x50(%rdi) + movq %r11, 0x58(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase_alt.S new file mode 100644 index 00000000000..14191e4d55e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p256/p256_scalarmulbase_alt.S @@ -0,0 +1,4173 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Scalar multiplication for precomputed point on NIST curve P-256 +// Input scalar[4], blocksize, table[]; output res[8] +// +// extern void p256_scalarmulbase_alt +// (uint64_t res[static 8], +// uint64_t scalar[static 4], +// uint64_t blocksize, +// uint64_t *table); +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, the input argument "table" is expected to be a table of +// multiples of the point P in Montgomery-affine form, with each block +// corresponding to "blocksize" bits of the scalar as follows, where +// B = 2^{blocksize-1} (e.g. B = 8 for blocksize = 4): +// +// For each i,j with blocksize * i <= 256 and 1 <= j <= B +// the multiple 2^{blocksize * i} * j * P is stored at +// tab[8 * (B * i + (j - 1))], considered as uint64_t pointers +// or tab + 64 * (B * i + (j - 1)) as byte pointers. 
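As a quick cross-check of the table-addressing rule just described, a minimal C sketch of the same indexing follows; it is illustrative only and not part of the imported source, and the helper name p256_precomp_entry is hypothetical.

#include <stdint.h>

/* Pointer to the 64-byte table entry holding 2^(blocksize*i) * j * P in
   Montgomery-affine form (x in the first 4 limbs, y in the next 4),
   assuming 1 <= blocksize <= 63, blocksize*i <= 256 and 1 <= j <= B. */
static inline const uint64_t *p256_precomp_entry(const uint64_t *tab,
                                                 uint64_t blocksize,
                                                 uint64_t i, uint64_t j) {
  uint64_t B = (uint64_t)1 << (blocksize - 1);   /* B = 2^{blocksize-1} */
  return tab + 8 * (B * i + (j - 1));            /* byte offset 64*(B*i + j - 1) */
}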
+// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = blocksize, RCX = table +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = blocksize, R9 = table +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_scalarmulbase_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_scalarmulbase_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. The last z2, z3 values can +// safely be overlaid on "nacc", which is no longer needed at the end. +// Uppercase syntactic variants make x86_att version simpler to generate + +#define RSCALAR (0*NUMSIZE) +#define ACC (1*NUMSIZE) +#define NACC (4*NUMSIZE) +#define TABENT (7*NUMSIZE) +#define Z2 (4*NUMSIZE) +#define Z3 (5*NUMSIZE) + +#define rscalar RSCALAR(%rsp) +#define acc ACC(%rsp) +#define nacc NACC(%rsp) +#define tabent TABENT(%rsp) + +#define z2 Z2(%rsp) +#define z3 Z3(%rsp) + +#define res (9*NUMSIZE)(%rsp) +#define blocksize (9*NUMSIZE+8)(%rsp) +#define table (9*NUMSIZE+16)(%rsp) +#define i (9*NUMSIZE+24)(%rsp) +#define bf (9*NUMSIZE+32)(%rsp) +#define cf (9*NUMSIZE+40)(%rsp) +#define j (9*NUMSIZE+48)(%rsp) + +#define NSPACE (11*NUMSIZE) + +S2N_BN_SYMBOL(p256_scalarmulbase_alt): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx + callq p256_scalarmulbase_alt_standard + popq %rsi + popq %rdi + ret + +p256_scalarmulbase_alt_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the input arguments except the scalar, since that gets absorbed +// immediately. The "table" value subsequently gets shifted up each iteration +// of the loop, while "res" and "blocksize" are static throughout. + + movq %rdi, res + movq %rdx, blocksize + movq %rcx, table + +// Load the digits of group order n_256 = [%r15;%r14;%r13;%r12] + + movq $0xf3b9cac2fc632551, %r12 + movq $0xbce6faada7179e84, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xffffffff00000000, %r15 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 +// Store it to "rscalar" (reduced scalar) + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + + movq %r8, RSCALAR(%rsp) + movq %r9, RSCALAR+8(%rsp) + movq %r10, RSCALAR+16(%rsp) + movq %r11, RSCALAR+24(%rsp) + +// Initialize the accumulator to all zeros and the "carry flag" cf to 0 + + xorl %eax, %eax + + movq %rax, ACC(%rsp) + movq %rax, ACC+8(%rsp) + movq %rax, ACC+16(%rsp) + movq %rax, ACC+24(%rsp) + movq %rax, ACC+32(%rsp) + movq %rax, ACC+40(%rsp) + movq %rax, ACC+48(%rsp) + movq %rax, ACC+56(%rsp) + movq %rax, ACC+64(%rsp) + movq %rax, ACC+72(%rsp) + movq %rax, ACC+80(%rsp) + movq %rax, ACC+88(%rsp) + + movq %rax, cf + +// Main loop over {i >= 0 | blocksize * i <= 256}. Note the non-strict +// inequality, to allow top carry for any choices of blocksize. + + movq %rax, i + +p256_scalarmulbase_alt_loop: + +// The next raw bitfield is bf = bitfield(blocksize * i,blocksize) + cf, +// adding in the deferred carry cf. 
We then shift the whole scalar right +// by blocksize so we can keep picking bitfield(0,blocksize). + + movq RSCALAR(%rsp), %r8 + movq RSCALAR+8(%rsp), %r9 + movq RSCALAR+16(%rsp), %r10 + movq RSCALAR+24(%rsp), %r11 + + movq blocksize, %rcx + movl $1, %eax + shlq %cl, %rax + decq %rax + andq %r8, %rax + + shrdq %cl, %r9, %r8 + shrdq %cl, %r10, %r9 + shrdq %cl, %r11, %r10 + shrq %cl, %r11 + + addq cf, %rax + movq %rax, bf + + movq %r8, RSCALAR(%rsp) + movq %r9, RSCALAR+8(%rsp) + movq %r10, RSCALAR+16(%rsp) + movq %r11, RSCALAR+24(%rsp) + +// Now if bf <= B we just select entry j, unnegated and set cf = 0. +// If bf > B we set j = 2 * B - bf and negate the j'th entry, setting cf = 1. +// In either case we ultimately add bf, in the latter case with deferred +// carry as 2 * B - (2 * B - bf) = bf. + + movl $1, %eax + movq blocksize, %rcx + shlq %cl, %rax + movq %rax, %rbx + shrq $1, %rax + + subq bf, %rbx + cmpq bf, %rax + + cmovncq bf, %rbx + sbbq %rax, %rax + movq %rbx, j + negq %rax + movq %rax, cf + +// Load table entry j - 1 for nonzero j in constant-time style. + + movq blocksize, %rcx + decq %rcx + movl $1, %esi + shlq %cl, %rsi + movq j, %r12 + movq table, %rbp + +p256_scalarmulbase_alt_tabloop: + subq $1, %r12 + cmovzq (%rbp), %rax + cmovzq 8(%rbp), %rbx + cmovzq 16(%rbp), %rcx + cmovzq 24(%rbp), %rdx + cmovzq 32(%rbp), %r8 + cmovzq 40(%rbp), %r9 + cmovzq 48(%rbp), %r10 + cmovzq 56(%rbp), %r11 + + addq $64, %rbp + decq %rsi + jnz p256_scalarmulbase_alt_tabloop + + movq %rbp, table + +// Before storing back, optionally negate the y coordinate of the table entry + + xorl %r14d, %r14d + leaq -1(%r14), %r12 + movq $0x00000000ffffffff, %r15 + movq %r15, %r13 + negq %r15 + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq cf, %rax + testq %rax, %rax + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + +// Add the adjusted table point to the accumulator + + leaq NACC(%rsp), %rdi + leaq ACC(%rsp), %rsi + leaq TABENT(%rsp), %rdx + callq p256_scalarmulbase_alt_local_p256_montjmixadd + +// However, only commit that update to the accumulator if j is nonzero, +// because the mixed addition function does not handle this case directly, +// and in any case we didn't choose the table entry appropriately. 
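To make the digit handling described in the comments above concrete, here is a short C sketch of just the recoding arithmetic: the per-iteration bitfield plus deferred carry, the bf > B case, and the j == 0 skip. It is a sketch only; the function names next_bitfield and recode_walk are hypothetical, the branches use plain if/else for readability rather than the constant-time cmov/negation pattern the assembly actually uses, and no field or point arithmetic is shown.

#include <stdint.h>

/* Extract the low 'blocksize' bits of a 4-limb scalar and shift the scalar
   right by 'blocksize' in place (assumes 1 <= blocksize <= 63). */
static uint64_t next_bitfield(uint64_t s[4], uint64_t blocksize) {
  uint64_t bits = s[0] & (((uint64_t)1 << blocksize) - 1);
  for (int k = 0; k < 4; k++) {
    uint64_t hi = (k < 3) ? s[k + 1] : 0;
    s[k] = (s[k] >> blocksize) | (hi << (64 - blocksize));
  }
  return bits;
}

/* Walk the (already reduced) scalar the same way the main loop does. */
static void recode_walk(uint64_t scalar[4], uint64_t blocksize) {
  uint64_t B = (uint64_t)1 << (blocksize - 1);
  uint64_t cf = 0;                     /* deferred carry into the next digit */
  for (uint64_t i = 0; blocksize * i <= 256; i++) {
    uint64_t bf = next_bitfield(scalar, blocksize) + cf;
    uint64_t j, negate_y;
    if (bf <= B) {                     /* take +bf*P as-is, no deferred carry */
      j = bf;
      negate_y = 0;
      cf = 0;
    } else {                           /* take -(2*B - bf)*P and defer a carry of
                                          2*B, since -(2*B - bf) + 2*B = bf */
      j = 2 * B - bf;
      negate_y = 1;
      cf = 1;
    }
    /* The assembly now loads entry j-1 in constant time, optionally negates
       the y coordinate (replacing it by p_256 - y) when negate_y is set, and
       commits the mixed addition to the accumulator only when j != 0. */
    (void)j; (void)negate_y;
  }
}

With blocksize = 4 this is the familiar width-4 signed-window walk with B = 8, matching the example given in the header comment.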
+ + movq j, %rax + testq %rax, %rax + + movq ACC(%rsp), %rax + cmovnzq NACC(%rsp), %rax + movq %rax, ACC(%rsp) + + movq ACC+8(%rsp), %rax + cmovnzq NACC+8(%rsp), %rax + movq %rax, ACC+8(%rsp) + + movq ACC+16(%rsp), %rax + cmovnzq NACC+16(%rsp), %rax + movq %rax, ACC+16(%rsp) + + movq ACC+24(%rsp), %rax + cmovnzq NACC+24(%rsp), %rax + movq %rax, ACC+24(%rsp) + + movq ACC+32(%rsp), %rax + cmovnzq NACC+32(%rsp), %rax + movq %rax, ACC+32(%rsp) + + movq ACC+40(%rsp), %rax + cmovnzq NACC+40(%rsp), %rax + movq %rax, ACC+40(%rsp) + + movq ACC+48(%rsp), %rax + cmovnzq NACC+48(%rsp), %rax + movq %rax, ACC+48(%rsp) + + movq ACC+56(%rsp), %rax + cmovnzq NACC+56(%rsp), %rax + movq %rax, ACC+56(%rsp) + + movq ACC+64(%rsp), %rax + cmovnzq NACC+64(%rsp), %rax + movq %rax, ACC+64(%rsp) + + movq ACC+72(%rsp), %rax + cmovnzq NACC+72(%rsp), %rax + movq %rax, ACC+72(%rsp) + + movq ACC+80(%rsp), %rax + cmovnzq NACC+80(%rsp), %rax + movq %rax, ACC+80(%rsp) + + movq ACC+88(%rsp), %rax + cmovnzq NACC+88(%rsp), %rax + movq %rax, ACC+88(%rsp) + +// Loop while blocksize * i <= 256 + + movq i, %rax + incq %rax + movq %rax, i + + imulq blocksize, %rax + cmpq $257, %rax + jc p256_scalarmulbase_alt_loop + +// That's the end of the main loop, and we just need to translate +// back from the Jacobian representation to affine. First of all, +// let z2 = 1/z^2 and z3 = 1/z^3, both without Montgomery form + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + callq p256_scalarmulbase_alt_local_montsqr_p256 + + leaq Z3(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z2(%rsp), %rdx + callq p256_scalarmulbase_alt_local_montmul_p256 + + leaq Z2(%rsp), %rdi + leaq Z3(%rsp), %rsi + callq p256_scalarmulbase_alt_local_demont_p256 + + leaq Z3(%rsp), %rdi + leaq Z2(%rsp), %rsi + callq p256_scalarmulbase_alt_local_inv_p256 + + leaq Z2(%rsp), %rdi + leaq ACC+64(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmulbase_alt_local_montmul_p256 + +// Convert back from Jacobian (X, Y, Z) |-> (X/Z^2, Y/Z^3) + + movq res, %rdi + leaq ACC(%rsp), %rsi + leaq Z2(%rsp), %rdx + movq %rdi, %rbx + callq p256_scalarmulbase_alt_local_montmul_p256 + + leaq 32(%rbx), %rdi + leaq ACC+32(%rsp), %rsi + leaq Z3(%rsp), %rdx + callq p256_scalarmulbase_alt_local_montmul_p256 + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +p256_scalarmulbase_alt_local_demont_p256: + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + movabsq $0x100000000, %rcx + movq %r8, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rsi, %rsi + movq %r9, %rax + mulq %rcx + subq %rsi, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rsi, %rsi + negq %rcx + negq %rsi + incq %rcx + movq %r8, %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %rsi + sbbq %r8, %r8 + negq %r8 + movq %r9, %rax + mulq %rcx + addq %rax, %rsi + adcq %rdx, %r8 + negq %rcx + incq %rcx + movq %r10, %rax + mulq %rcx + addq %rax, %r11 + adcq %rdx, %rsi + sbbq %r9, %r9 + movq %r11, %rax + mulq %rcx + subq %r9, %rdx + addq %rax, %rsi + adcq %rdx, %r8 + sbbq %r9, %r9 + negq %rcx + negq %r9 + incq %rcx + movq %r10, %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %r10, %r10 + negq %r10 + movq %r11, %rax + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + movq %rsi, (%rdi) + movq %r8, 0x8(%rdi) + movq %r9, 0x10(%rdi) + movq %r10, 0x18(%rdi) + ret + +p256_scalarmulbase_alt_local_inv_p256: + pushq %rbx + pushq %rbp + 
pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xf0, %rsp + movq %rdi, 0xe0(%rsp) + xorl %ecx, %ecx + movl $0xffffffff, %edx + movq %rdx, %rbx + leaq -0x1(%rcx), %rax + negq %rdx + movq %rax, (%rsp) + movq %rbx, 0x8(%rsp) + movq %rcx, 0x10(%rsp) + movq %rdx, 0x18(%rsp) + movq %rcx, 0x20(%rsp) + movq (%rsi), %r8 + movq 0x8(%rsi), %r9 + movq 0x10(%rsi), %r10 + movq 0x18(%rsi), %r11 + leaq 0x1(%rcx), %rax + addq %r8, %rax + leaq -0x1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq %rax, 0x28(%rsp) + movq %rbx, 0x30(%rsp) + movq %rcx, 0x38(%rsp) + movq %rdx, 0x40(%rsp) + xorl %eax, %eax + movq %rax, 0x48(%rsp) + xorl %eax, %eax + movq %rax, 0x50(%rsp) + movq %rax, 0x58(%rsp) + movq %rax, 0x60(%rsp) + movq %rax, 0x68(%rsp) + movabsq $0x4000000000000, %rcx + movq %rcx, 0x78(%rsp) + movq %rax, 0x80(%rsp) + movq %rax, 0x88(%rsp) + movq %rax, 0x90(%rsp) + movq $0xa, 0xb0(%rsp) + movq $0x1, 0xb8(%rsp) + jmp p256_scalarmulbase_alt_inv_midloop +p256_scalarmulbase_alt_inv_loop: + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, 0xa0(%rsp) + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, 0xa8(%rsp) + xorl %ebx, %ebx + movq (%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x28(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + xorl %ebp, %ebp + movq (%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x28(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + xorl %ecx, %ecx + movq 0x8(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x30(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $0x3b, %rbx, %rdi + movq %rdi, (%rsp) + xorl %edi, %edi + movq 0x8(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq 0x30(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $0x3b, %rbp, %rsi + movq %rsi, 0x28(%rsp) + xorl %esi, %esi + movq 0x10(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq 0x38(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $0x3b, %rcx, %rbx + movq %rbx, 0x8(%rsp) + xorl %ebx, %ebx + movq 0x10(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq 0x38(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $0x3b, %rdi, %rbp + movq %rbp, 0x30(%rsp) + movq 0x18(%rsp), %rax + xorq %r9, %rax + movq 0x20(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x40(%rsp), %rax + xorq %r11, %rax + movq 0x48(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $0x3b, %rsi, %rcx + movq %rcx, 0x10(%rsp) + shrdq $0x3b, %rbp, %rsi + sarq $0x3b, %rbp + movq 0x18(%rsp), %rax + movq %rsi, 0x18(%rsp) + movq 0x20(%rsp), %rsi + movq %rbp, 0x20(%rsp) + xorq %r13, %rax + xorq %r13, %rsi + 
andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq 0x40(%rsp), %rax + xorq %r15, %rax + movq 0x48(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $0x3b, %rbx, %rdi + movq %rdi, 0x38(%rsp) + shrdq $0x3b, %rsi, %rbx + movq %rbx, 0x40(%rsp) + sarq $0x3b, %rsi + movq %rsi, 0x48(%rsp) + movq 0xa0(%rsp), %rbx + movq 0xa8(%rsp), %rbp + xorl %ecx, %ecx + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x50(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x50(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x78(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x78(%rsp) + xorl %ebx, %ebx + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + xorl %ebp, %ebp + movq 0x58(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, 0x58(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq 0x80(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, 0x80(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + xorl %esi, %esi + movq 0x60(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, 0x60(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq 0x88(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, 0x88(%rsp) + movq 0x68(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + movq 0x68(%rsp), %rax + movq %rcx, 0x68(%rsp) + movq %rdx, 0x70(%rsp) + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq 0x90(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, 0x90(%rsp) + movq %rdx, 0x98(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x78(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x80(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x88(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x90(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x98(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + 
adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x78(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x80(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x88(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x90(%rsp) +p256_scalarmulbase_alt_inv_midloop: + movq 0xb8(%rsp), %rsi + movq (%rsp), %rdx + movq 0x28(%rsp), %rcx + movq %rdx, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + xorl %ebp, %ebp + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq 
%rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %rdx + leaq (%rcx,%rax), %rdi + shlq $0x16, %rdx + shlq $0x16, %rdi + sarq $0x2b, %rdx + sarq $0x2b, %rdi + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %rbx + leaq (%rcx,%rax), %rcx + sarq $0x2a, %rbx + sarq $0x2a, %rcx + movq %rdx, 0xc0(%rsp) + movq %rbx, 0xc8(%rsp) + movq %rdi, 0xd0(%rsp) + movq %rcx, 0xd8(%rsp) + movq (%rsp), %r12 + imulq %r12, %rdi + imulq %rdx, %r12 + movq 0x28(%rsp), %r13 + imulq %r13, %rbx + imulq %rcx, %r13 + addq %rbx, %r12 + addq %rdi, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi 
+ testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi 
+ leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r10 + shlq $0x16, %r8 + shlq $0x16, %r10 + sarq $0x2b, %r8 + sarq $0x2b, %r10 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r15 + leaq (%rcx,%rax), %r11 + sarq $0x2a, %r15 + sarq $0x2a, %r11 + movq %r13, %rbx + movq %r12, %rcx + imulq %r8, %r12 + imulq %r15, %rbx + addq %rbx, %r12 + imulq %r11, %r13 + imulq %r10, %rcx + addq %rcx, %r13 + sarq $0x14, %r12 + sarq $0x14, %r13 + movq %r12, %rbx + andq $0xfffff, %rbx + movabsq $0xfffffe0000000000, %rax + orq %rax, %rbx + movq %r13, %rcx + andq $0xfffff, %rcx + movabsq $0xc000000000000000, %rax + orq %rax, %rcx + movq 0xc0(%rsp), %rax + imulq %r8, %rax + movq 0xd0(%rsp), %rdx + imulq %r15, %rdx + imulq 0xc8(%rsp), %r8 + imulq 0xd8(%rsp), %r15 + addq %r8, %r15 + leaq (%rax,%rdx), %r9 + movq 0xc0(%rsp), %rax + imulq %r10, %rax + movq 0xd0(%rsp), %rdx + imulq %r11, %rdx + imulq 0xc8(%rsp), %r10 + imulq 0xd8(%rsp), %r11 + addq %r10, %r11 + leaq (%rax,%rdx), %r13 + movq $0xfffffffffffffffe, %rax + movl $0x2, %edx + movq %rbx, %rdi + movq %rax, %r8 + testq %rsi, %rsi + cmovs %rbp, %r8 + testq $0x1, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + 
testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + cmovs %rbp, %r8 + movq %rbx, %rdi + testq %rdx, %rcx + cmoveq %rbp, %r8 + cmoveq %rbp, %rdi + sarq $1, %rcx + xorq %r8, %rdi + xorq %r8, %rsi + btq $0x3f, %r8 + cmovbq %rcx, %rbx + movq %rax, %r8 + subq %rax, %rsi + leaq (%rcx,%rdi), %rcx + sarq $1, %rcx + movl $0x100000, %eax + leaq (%rbx,%rax), %r8 + leaq (%rcx,%rax), %r12 + shlq $0x15, %r8 + shlq $0x15, %r12 + sarq $0x2b, %r8 + sarq $0x2b, %r12 + movabsq $0x20000100000, %rax + leaq (%rbx,%rax), %r10 + leaq (%rcx,%rax), %r14 + sarq $0x2b, %r10 + sarq $0x2b, %r14 + movq %r9, %rax + imulq %r8, %rax + movq %r13, %rdx + imulq %r10, %rdx + imulq %r15, %r8 + imulq %r11, %r10 + addq %r8, %r10 + leaq (%rax,%rdx), %r8 + movq %r9, %rax + imulq %r12, %rax + movq %r13, %rdx + imulq %r14, %rdx + imulq %r15, %r12 + imulq %r11, %r14 + addq %r12, %r14 + leaq (%rax,%rdx), %r12 + movq %rsi, 0xb8(%rsp) + decq 0xb0(%rsp) + jne p256_scalarmulbase_alt_inv_loop + movq (%rsp), %rax + movq 0x28(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $0x3f, %rax + movq %r8, %r9 + sarq $0x3f, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + movq %r10, %r11 + sarq $0x3f, %r11 + xorq %r11, %r10 + subq %r11, %r10 + 
xorq %rax, %r11 + movq %r12, %r13 + sarq $0x3f, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + movq %r14, %r15 + sarq $0x3f, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + xorl %r13d, %r13d + movq 0x50(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x78(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movq 0x58(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x80(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x88(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x68(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq 0x90(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + movq %r12, 0x50(%rsp) + movq %r13, 0x58(%rsp) + movq %r14, 0x60(%rsp) + movq %r15, 0x68(%rsp) + movq %r9, 0x70(%rsp) + movabsq $0xe000000000000000, %r8 + addq 0x50(%rsp), %r8 + movq $0xffffffffffffffff, %r9 + adcq 0x58(%rsp), %r9 + movq $0x1fffffff, %r10 + adcq 0x60(%rsp), %r10 + movabsq $0x2000000000000000, %r11 + adcq 0x68(%rsp), %r11 + movabsq $0x1fffffffe0000000, %r12 + adcq 0x70(%rsp), %r12 + movq %r8, %rbx + shlq $0x20, %rbx + movabsq $0xffffffff00000001, %rax + mulq %r8 + shrq $0x20, %r8 + addq %rbx, %r9 + adcq %r8, %r10 + adcq %rax, %r11 + adcq %rdx, %r12 + sbbq %rax, %rax + movl $0xffffffff, %ebx + andq %rax, %rbx + movabsq $0xffffffff00000001, %rdx + andq %rax, %rdx + subq %rax, %r9 + movq %r9, 0x50(%rsp) + sbbq %rbx, %r10 + movq %r10, 0x58(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x60(%rsp) + sbbq %rdx, %r12 + movq %r12, 0x68(%rsp) + movq 0x50(%rsp), %r8 + movq 0x58(%rsp), %r9 + movq 0x60(%rsp), %r10 + movq 0x68(%rsp), %r11 + movl $0x1, %eax + movl $0xffffffff, %ebx + leaq -0x2(%rax), %rcx + leaq -0x1(%rbx), %rdx + notq %rbx + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + cmovaeq %r8, %rax + cmovaeq %r9, %rbx + cmovaeq %r10, %rcx + cmovaeq %r11, %rdx + movq 0xe0(%rsp), %rdi + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + addq $0xf0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p256_scalarmulbase_alt_local_montmul_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx, %rcx + movq (%rcx), %rbx + movq (%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rcx), %rbx + xorl %r13d, %r13d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + 
addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rcx), %rbx + xorl %r15d, %r15d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rcx), %rbx + xorl %r8d, %r8d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmulbase_alt_local_montsqr_p256: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq (%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x8(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x18(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x10(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x8(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x18(%rsi), %rbx + movq 0x8(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x8(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x10(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x18(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + 
adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rdi) + movq %r13, 0x8(%rdi) + movq %r14, 0x10(%rdi) + movq %r15, 0x18(%rdi) + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +p256_scalarmulbase_alt_local_p256_montjmixadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, 
%rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rbp), %rbx + movq 0x40(%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rbp), %rbx + xorl %r13d, %r13d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rbp), %rbx + xorl %r15d, %r15d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rbp), %rbx + xorl %r8d, %r8d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x0(%rbp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + 
movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rbp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rbp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rbp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq 0x20(%rsp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, 
%rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x40(%rsp), %rax + subq (%rsi), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsi), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsi), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0x20(%rsi), %rax + movq 0x28(%rsp), %rcx + sbbq 0x28(%rsi), %rcx + movq 0x30(%rsp), %r8 + sbbq 0x30(%rsi), %r8 + movq 0x38(%rsp), %r9 + sbbq 0x38(%rsi), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0xa8(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0xb8(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0xb0(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 
0xb8(%rsp), %rbx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq 
$0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq (%rsi), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rsi), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax 
+ adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x40(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 
(%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq 0x40(%rsi), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsi), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsi), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsi), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 
0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x20(%rsi), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsi), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsi), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsi), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx 
+ addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x80(%rsp), %rbx + movq 0x20(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x30(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x38(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x88(%rsp), %rbx + xorl %r13d, %r13d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x90(%rsp), %rbx + xorl %r15d, %r15d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x98(%rsp), %rbx + xorl %r8d, %r8d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + 
xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %rax + movq 0x48(%rsi), %rdx + orq 0x50(%rsi), %rax + orq 0x58(%rsi), %rdx + orq %rdx, %rax + movq (%rsp), %r8 + movq 0x0(%rbp), %rax + cmoveq %rax, %r8 + movq 0x8(%rsp), %r9 + movq 0x8(%rbp), %rax + cmoveq %rax, %r9 + movq 0x10(%rsp), %r10 + movq 0x10(%rbp), %rax + cmoveq %rax, %r10 + movq 0x18(%rsp), %r11 + movq 0x18(%rbp), %rax + cmoveq %rax, %r11 + movq 0x80(%rsp), %r12 + movq 0x20(%rbp), %rax + cmoveq %rax, %r12 + movq 0x88(%rsp), %r13 + movq 0x28(%rbp), %rax + cmoveq %rax, %r13 + movq 0x90(%rsp), %r14 + movq 0x30(%rbp), %rax + cmoveq %rax, %r14 + movq 0x98(%rsp), %r15 + movq 0x38(%rbp), %rax + cmoveq %rax, %r15 + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + movq %r12, 0x20(%rdi) + movq %r13, 0x28(%rdi) + movq %r14, 0x30(%rdi) + movq %r15, 0x38(%rdi) + movq 0xa0(%rsp), %r8 + movq 0xa8(%rsp), %r9 + movq 0xb0(%rsp), %r10 + movq 0xb8(%rsp), %r11 + movl $0x1, %eax + cmoveq %rax, %r8 + movabsq $0xffffffff00000000, %rax + cmoveq %rax, %r9 + movq $0xffffffffffffffff, %rax + cmoveq %rax, %r10 + movl $0xfffffffe, %eax + cmoveq %rax, %r11 + movq %r8, 0x40(%rdi) + movq %r9, 0x48(%rdi) + movq %r10, 0x50(%rdi) + movq %r11, 0x58(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_add_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_add_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_add_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_add_p384.S index 94293e4e703..4d7387f1809 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_add_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_add_p384.S @@ -41,6 +41,7 @@ S2N_BN_SYMBOL(bignum_add_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_bigendian_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_bigendian_6.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_bigendian_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_bigendian_6.S index 0a23e35659f..59d417b9cdf 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_bigendian_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_bigendian_6.S @@ -50,6 +50,7 @@ S2N_BN_SYMBOL(bignum_bigendian_6): S2N_BN_SYMBOL(bignum_frombebytes_6): S2N_BN_SYMBOL(bignum_tobebytes_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384.S index 76f67950872..1d9ff617379 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384.S @@ -45,6 +45,7 @@ S2N_BN_SYMBOL(bignum_cmul_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384_alt.S 
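The newly added P-256 routine that ends above is a mulq-based mix of 4x4 limb multiplications and Montgomery reductions modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1, and each block finishes with the same branch-free correction: add 2^256 - p_256 to the four result limbs and let cmovb keep the wrapped sum only when that addition (together with the spilled top carry word) overflows 256 bits, i.e. only when the intermediate value was at least p_256. A minimal C sketch of that correction follows; p256_reduce_once, its signature, and the use of unsigned __int128 are illustrative assumptions, not an AWS-LC or s2n-bignum interface.

#include <stdint.h>

/* Sketch of the select-on-carry correction used after each multiply,
 * square, and reduction block above.  Keep t + (2^256 - p_256) only if
 * the addition carries out of 256 bits or the incoming top word is set;
 * otherwise keep t.  The choice is made by masking, never by a branch,
 * so the step stays constant time. */
static void p256_reduce_once(uint64_t r[4], const uint64_t t[4], uint64_t top) {
  /* 2^256 - p_256 in little-endian 64-bit limbs: the same constants
   * 0x1, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffe that the
   * assembly materializes just before its cmovb sequence. */
  static const uint64_t NEG_P256[4] = {
      0x0000000000000001ULL, 0xffffffff00000000ULL,
      0xffffffffffffffffULL, 0x00000000fffffffeULL};
  uint64_t s[4], carry = 0;
  for (int i = 0; i < 4; i++) {
    unsigned __int128 acc = (unsigned __int128)t[i] + NEG_P256[i] + carry;
    s[i] = (uint64_t)acc;
    carry = (uint64_t)(acc >> 64);
  }
  /* Reduce exactly when the addition carried or the extra word was set. */
  uint64_t mask = 0 - ((carry | top) & 1);
  for (int i = 0; i < 4; i++) {
    r[i] = (s[i] & mask) | (t[i] & ~mask);
  }
}

The same four constants also appear in the routine's final cmoveq writeback, since 2^256 - p_256 equals 2^256 mod p_256, the Montgomery representation of 1.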
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384_alt.S index 2e21e646150..f5b78addcec 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_cmul_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_cmul_p384_alt.S @@ -49,6 +49,7 @@ #define qshort %ecx S2N_BN_SYMBOL(bignum_cmul_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384.S index 9edb4ab6108..5b02af4252a 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384.S @@ -71,6 +71,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_deamont_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384_alt.S index c0e6096bdd2..2c97a83a884 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_deamont_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_deamont_p384_alt.S @@ -71,6 +71,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_deamont_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_demont_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_demont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384.S index 36a5ef0078f..0ad5d43248a 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_demont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384.S @@ -63,6 +63,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_demont_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_demont_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_demont_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384_alt.S index adccd962e70..dafc219c17d 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_demont_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_demont_p384_alt.S @@ -63,6 +63,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_demont_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_double_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_double_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_double_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_double_p384.S index 7e0c35dab37..1afc75482d8 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_double_p384.S +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_double_p384.S @@ -39,6 +39,7 @@ S2N_BN_SYMBOL(bignum_double_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_half_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_half_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_half_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_half_p384.S index a3e39541739..629dcedd5c9 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_half_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_half_p384.S @@ -36,6 +36,7 @@ S2N_BN_SYMBOL(bignum_half_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_inv_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_inv_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_inv_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_inv_p384.S index e1dfecfa2ea..2acba946089 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_inv_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_inv_p384.S @@ -1041,6 +1041,7 @@ leaq (%rax,%rdx), %r12 S2N_BN_SYMBOL(bignum_inv_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -1140,9 +1141,9 @@ S2N_BN_SYMBOL(bignum_inv_p384): movq $15, i movq $1, d - jmp midloop + jmp bignum_inv_p384_midloop -loop: +bignum_inv_p384_loop: // Separate out the matrix into sign-magnitude pairs @@ -1587,7 +1588,7 @@ loop: amontred(v) -midloop: +bignum_inv_p384_midloop: divstep59(d,ff,gg) movq %rsi, d @@ -1595,7 +1596,7 @@ midloop: // Next iteration decq i - jnz loop + jnz bignum_inv_p384_loop // The 15th and last iteration does not need anything except the // u value and the sign of f; the latter can be obtained from the diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_littleendian_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_littleendian_6.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_littleendian_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_littleendian_6.S index fe5744a86ef..d4110ef56e0 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_littleendian_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_littleendian_6.S @@ -42,6 +42,7 @@ S2N_BN_SYMBOL(bignum_littleendian_6): S2N_BN_SYMBOL(bignum_fromlebytes_6): S2N_BN_SYMBOL(bignum_tolebytes_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384.S similarity index 91% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_n384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384.S index 169a136ea32..4914f5a1769 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384.S @@ -43,6 +43,7 @@ S2N_BN_SYMBOL(bignum_mod_n384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -62,7 +63,7 @@ S2N_BN_SYMBOL(bignum_mod_n384): // If the input is already <= 5 words long, go to a trivial "copy" path cmpq $6, k - jc shortinput + jc bignum_mod_n384_shortinput // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -105,9 +106,9 @@ S2N_BN_SYMBOL(bignum_mod_n384): // Now do (k-6) iterations of 7->6 word modular reduction testq k, k - 
jz writeback + jz bignum_mod_n384_writeback -loop: +bignum_mod_n384_loop: // Compute q = min (m5 + 1) (2^64 - 1) @@ -170,11 +171,11 @@ loop: movq d, m0 decq k - jnz loop + jnz bignum_mod_n384_loop // Write back -writeback: +bignum_mod_n384_writeback: movq m0, (z) movq m1, 8(z) @@ -195,7 +196,7 @@ writeback: #endif ret -shortinput: +bignum_mod_n384_shortinput: xorq m0, m0 xorq m1, m1 @@ -205,21 +206,21 @@ shortinput: xorq m5, m5 testq k, k - jz writeback + jz bignum_mod_n384_writeback movq (%rdx), m0 decq k - jz writeback + jz bignum_mod_n384_writeback movq 8(%rdx), m1 decq k - jz writeback + jz bignum_mod_n384_writeback movq 16(%rdx), m2 decq k - jz writeback + jz bignum_mod_n384_writeback movq 24(%rdx), m3 decq k - jz writeback + jz bignum_mod_n384_writeback movq 32(%rdx), m4 - jmp writeback + jmp bignum_mod_n384_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_6.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_6.S index 6b68c2a4445..2daf1fce8c3 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_6.S @@ -40,6 +40,7 @@ S2N_BN_SYMBOL(bignum_mod_n384_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_alt.S similarity index 90% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_alt.S index 92282a83a7e..f25bea3f779 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_n384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_n384_alt.S @@ -44,6 +44,7 @@ #define qshort %ebp S2N_BN_SYMBOL(bignum_mod_n384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -64,7 +65,7 @@ S2N_BN_SYMBOL(bignum_mod_n384_alt): // If the input is already <= 5 words long, go to a trivial "copy" path cmpq $6, k - jc shortinput + jc bignum_mod_n384_alt_shortinput // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -107,9 +108,9 @@ S2N_BN_SYMBOL(bignum_mod_n384_alt): // Now do (k-6) iterations of 7->6 word modular reduction testq k, k - jz writeback + jz bignum_mod_n384_alt_writeback -loop: +bignum_mod_n384_alt_loop: // Compute q = min (m5 + 1) (2^64 - 1) @@ -173,11 +174,11 @@ loop: movq d, m0 decq k - jnz loop + jnz bignum_mod_n384_alt_loop // Write back -writeback: +bignum_mod_n384_alt_writeback: movq m0, (z) movq m1, 8(z) @@ -199,7 +200,7 @@ writeback: #endif ret -shortinput: +bignum_mod_n384_alt_shortinput: xorq m0, m0 xorq m1, m1 @@ -209,21 +210,21 @@ shortinput: xorq m5, m5 testq k, k - jz writeback + jz bignum_mod_n384_alt_writeback movq (%rdx), m0 decq k - jz writeback + jz bignum_mod_n384_alt_writeback movq 8(%rdx), m1 decq k - jz writeback + jz bignum_mod_n384_alt_writeback movq 16(%rdx), m2 decq k - jz writeback + jz bignum_mod_n384_alt_writeback movq 24(%rdx), m3 decq k - jz writeback + jz bignum_mod_n384_alt_writeback movq 32(%rdx), m4 - jmp writeback + jmp bignum_mod_n384_alt_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384.S 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384.S similarity index 91% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384.S index c9caf41c83d..69418ecd5bf 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384.S @@ -42,6 +42,7 @@ S2N_BN_SYMBOL(bignum_mod_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -61,7 +62,7 @@ S2N_BN_SYMBOL(bignum_mod_p384): // If the input is already <= 5 words long, go to a trivial "copy" path cmpq $6, k - jc shortinput + jc bignum_mod_p384_shortinput // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -104,9 +105,9 @@ S2N_BN_SYMBOL(bignum_mod_p384): // Now do (k-6) iterations of 7->6 word modular reduction testq k, k - jz writeback + jz bignum_mod_p384_writeback -loop: +bignum_mod_p384_loop: // Compute q = min (m5 + 1) (2^64 - 1) @@ -169,11 +170,11 @@ loop: movq d, m0 decq k - jnz loop + jnz bignum_mod_p384_loop // Write back -writeback: +bignum_mod_p384_writeback: movq m0, (z) movq m1, 8(z) @@ -194,7 +195,7 @@ writeback: #endif ret -shortinput: +bignum_mod_p384_shortinput: xorq m0, m0 xorq m1, m1 @@ -204,21 +205,21 @@ shortinput: xorq m5, m5 testq k, k - jz writeback + jz bignum_mod_p384_writeback movq (%rdx), m0 decq k - jz writeback + jz bignum_mod_p384_writeback movq 8(%rdx), m1 decq k - jz writeback + jz bignum_mod_p384_writeback movq 16(%rdx), m2 decq k - jz writeback + jz bignum_mod_p384_writeback movq 24(%rdx), m3 decq k - jz writeback + jz bignum_mod_p384_writeback movq 32(%rdx), m4 - jmp writeback + jmp bignum_mod_p384_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_6.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_6.S index 7196a76f314..2c27d82ccc4 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_6.S @@ -39,6 +39,7 @@ S2N_BN_SYMBOL(bignum_mod_p384_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_alt.S similarity index 90% rename from third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_alt.S index 79da7842a62..16f54defb30 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mod_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mod_p384_alt.S @@ -46,6 +46,7 @@ S2N_BN_SYMBOL(bignum_mod_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi @@ -65,7 +66,7 @@ S2N_BN_SYMBOL(bignum_mod_p384_alt): // If the input is already <= 5 words long, go to a trivial "copy" path cmpq $6, k - jc shortinput + jc bignum_mod_p384_alt_shortinput // Otherwise load the top 6 digits (top-down) and reduce k by 6 @@ -108,9 +109,9 @@ S2N_BN_SYMBOL(bignum_mod_p384_alt): // Now do (k-6) iterations of 7->6 word modular reduction testq k, k - jz writeback + jz bignum_mod_p384_alt_writeback -loop: +bignum_mod_p384_alt_loop: // Compute q = min (m5 + 1) (2^64 - 1) @@ -173,11 +174,11 @@ loop: movq d, m0 decq k - 
jnz loop + jnz bignum_mod_p384_alt_loop // Write back -writeback: +bignum_mod_p384_alt_writeback: movq m0, (z) movq m1, 8(z) @@ -198,7 +199,7 @@ writeback: #endif ret -shortinput: +bignum_mod_p384_alt_shortinput: xorq m0, m0 xorq m1, m1 @@ -208,21 +209,21 @@ shortinput: xorq m5, m5 testq k, k - jz writeback + jz bignum_mod_p384_alt_writeback movq (%rdx), m0 decq k - jz writeback + jz bignum_mod_p384_alt_writeback movq 8(%rdx), m1 decq k - jz writeback + jz bignum_mod_p384_alt_writeback movq 16(%rdx), m2 decq k - jz writeback + jz bignum_mod_p384_alt_writeback movq 24(%rdx), m3 decq k - jz writeback + jz bignum_mod_p384_alt_writeback movq 32(%rdx), m4 - jmp writeback + jmp bignum_mod_p384_alt_writeback #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_montinv_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montinv_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_montinv_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montinv_p384.S index 81928ed59dc..b85c917793e 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_montinv_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montinv_p384.S @@ -1046,6 +1046,7 @@ leaq (%rax,%rdx), %r12 S2N_BN_SYMBOL(bignum_montinv_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384.S index 105efac6109..b11d91efd1c 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384.S @@ -86,6 +86,7 @@ adcq $0, d7 S2N_BN_SYMBOL(bignum_montmul_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384_alt.S index 5a8b4905d92..4d8b14a2a5b 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_montmul_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montmul_p384_alt.S @@ -108,6 +108,7 @@ adcq $0, d7 S2N_BN_SYMBOL(bignum_montmul_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384.S index 0d0b36013ab..b71edd98017 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384.S @@ -83,6 +83,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_montsqr_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384_alt.S similarity index 99% rename from 
third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384_alt.S index 061ef6181d3..e00a162ae77 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_montsqr_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_montsqr_p384_alt.S @@ -105,6 +105,7 @@ sbbq $0, d6 S2N_BN_SYMBOL(bignum_montsqr_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_mux_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mux_6.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_mux_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mux_6.S index cb4c2ca503c..e4890132381 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_mux_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_mux_6.S @@ -31,6 +31,7 @@ S2N_BN_SYMBOL(bignum_mux_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_neg_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_neg_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_neg_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_neg_p384.S index 746c01286a2..edefc39c899 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_neg_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_neg_p384.S @@ -31,6 +31,7 @@ #define n0short %eax S2N_BN_SYMBOL(bignum_neg_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_nonzero_6.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_nonzero_6.S similarity index 98% rename from third_party/s2n-bignum/x86_att/p384/bignum_nonzero_6.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_nonzero_6.S index 7fdb6bab060..c55511452e1 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_nonzero_6.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_nonzero_6.S @@ -26,6 +26,7 @@ S2N_BN_SYMBOL(bignum_nonzero_6): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_optneg_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_optneg_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_optneg_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_optneg_p384.S index 0a8b247e5dc..af11ff1c1b1 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_optneg_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_optneg_p384.S @@ -34,6 +34,7 @@ #define n0short %eax S2N_BN_SYMBOL(bignum_optneg_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_sub_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_sub_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_sub_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_sub_p384.S index 5914f4ae9cf..c4f617386bf 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_sub_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_sub_p384.S @@ -40,6 +40,7 @@ S2N_BN_SYMBOL(bignum_sub_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384.S 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384.S index 66503a2ec40..326bf327e16 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384.S @@ -85,6 +85,7 @@ adcq $0, d7 S2N_BN_SYMBOL(bignum_tomont_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384_alt.S index 725713d3410..3aaa2a18cdc 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_tomont_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_tomont_p384_alt.S @@ -103,6 +103,7 @@ adcq $0, d7 S2N_BN_SYMBOL(bignum_tomont_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_triple_p384.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_triple_p384.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384.S index 52b70f6bea4..417bf465342 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_triple_p384.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384.S @@ -40,6 +40,7 @@ #define qshort %edx S2N_BN_SYMBOL(bignum_triple_p384): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/bignum_triple_p384_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/bignum_triple_p384_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384_alt.S index bdbf7e8f6d7..a48c8848890 100644 --- a/third_party/s2n-bignum/x86_att/p384/bignum_triple_p384_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/bignum_triple_p384_alt.S @@ -43,6 +43,7 @@ #define dshort %edx S2N_BN_SYMBOL(bignum_triple_p384_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd.S index 60780822043..23e12ed5e3b 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd.S @@ -893,6 +893,7 @@ cmovnbe 40+P2, r5 S2N_BN_SYMBOL(p384_montjadd): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjadd_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd_alt.S index e36a60f331a..d16b163d338 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjadd_alt.S +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjadd_alt.S @@ -818,6 +818,7 @@ cmovnbe 40+P2, r5 S2N_BN_SYMBOL(p384_montjadd_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjdouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble.S index b51d24f9317..3be512a1338 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjdouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble.S @@ -904,6 +904,7 @@ movq %r13, 0x28+P0 S2N_BN_SYMBOL(p384_montjdouble): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjdouble_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble_alt.S index 8258e352674..5a2e397ac58 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjdouble_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjdouble_alt.S @@ -1098,6 +1098,7 @@ movq %r13, 0x28+P0 S2N_BN_SYMBOL(p384_montjdouble_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjmixadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd.S index 539a28117a8..284c4577190 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd.S @@ -886,6 +886,7 @@ movq r5, 40+P S2N_BN_SYMBOL(p384_montjmixadd): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjmixadd_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd_alt.S index da610ee88eb..df8a7533297 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjmixadd_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjmixadd_alt.S @@ -811,6 +811,7 @@ movq r5, 40+P S2N_BN_SYMBOL(p384_montjmixadd_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul.S index 38bea41d878..1b1445e75ec 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul.S @@ -87,6 +87,7 @@ cmovzq TAB+JACSIZE*(I-1)+88(%rsp), %r9 S2N_BN_SYMBOL(p384_montjscalarmul): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. 
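Alongside the directory move, every renamed x86_att source gains a _CET_ENDBR marker immediately after its S2N_BN_SYMBOL(...) entry point, so these functions keep a valid landing pad when Intel CET indirect-branch tracking is enforced. The macro is supplied by the imported s2n-bignum headers; the definition sketched below only shows the usual shape of such a macro and is an assumption, not a copy of that header.

/* Illustrative only: expand to the IBT landing-pad instruction when the
 * compiler signals CET support via __CET__, and to nothing otherwise, so
 * the files still assemble on toolchains and targets without CET. */
#if defined(__x86_64__) && defined(__CET__)
#define _CET_ENDBR endbr64
#else
#define _CET_ENDBR
#endif

Because the macro collapses to nothing when control-flow protection is disabled, inserting it at every entry point is a no-op for existing builds while keeping the assembly usable under IBT enforcement.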
diff --git a/third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul_alt.S index c666db6dbe9..07fd39d4b1d 100644 --- a/third_party/s2n-bignum/x86_att/p384/p384_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p384/p384_montjscalarmul_alt.S @@ -87,6 +87,7 @@ cmovzq TAB+JACSIZE*(I-1)+88(%rsp), %r9 S2N_BN_SYMBOL(p384_montjscalarmul_alt): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_add_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_add_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_add_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_add_p521.S index b046828d458..430c382a628 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_add_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_add_p521.S @@ -40,6 +40,7 @@ S2N_BN_SYMBOL(bignum_add_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521.S index fbfc3063fd4..97695288c7d 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521.S @@ -52,6 +52,7 @@ #define h d9 S2N_BN_SYMBOL(bignum_cmul_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521_alt.S index fd6986f232a..794193ef97e 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_cmul_p521_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_cmul_p521_alt.S @@ -56,6 +56,7 @@ #define h d9 S2N_BN_SYMBOL(bignum_cmul_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_deamont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_deamont_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_deamont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_deamont_p521.S index 099c0e33fcf..f3ebc44c6a5 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_deamont_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_deamont_p521.S @@ -40,6 +40,7 @@ #define d8 %rbp S2N_BN_SYMBOL(bignum_deamont_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_demont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_demont_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_demont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_demont_p521.S index 
ef83448b156..8796752852d 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_demont_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_demont_p521.S @@ -40,6 +40,7 @@ #define d8 %rdx S2N_BN_SYMBOL(bignum_demont_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_double_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_double_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_double_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_double_p521.S index 9322ec0b1a5..d5f091669e7 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_double_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_double_p521.S @@ -28,6 +28,7 @@ S2N_BN_SYMBOL(bignum_double_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_fromlebytes_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_fromlebytes_p521.S similarity index 98% rename from third_party/s2n-bignum/x86_att/p521/bignum_fromlebytes_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_fromlebytes_p521.S index 6a80dce3c22..907de58755d 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_fromlebytes_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_fromlebytes_p521.S @@ -28,6 +28,7 @@ #define a %rax S2N_BN_SYMBOL(bignum_fromlebytes_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_half_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_half_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_half_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_half_p521.S index ee8b91a325a..c974d995679 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_half_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_half_p521.S @@ -39,6 +39,7 @@ S2N_BN_SYMBOL(bignum_half_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_inv_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_inv_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_inv_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_inv_p521.S index a23dbc56b43..51eb6edd9a4 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_inv_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_inv_p521.S @@ -966,6 +966,7 @@ leaq (%rax,%rdx), %r12 S2N_BN_SYMBOL(bignum_inv_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9.S index c7e33f88fd3..9407283a86d 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9.S @@ -40,6 +40,7 @@ #define qshort %edx S2N_BN_SYMBOL(bignum_mod_n521_9): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9_alt.S similarity index 99% rename from 
third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9_alt.S index aeb314691ab..0ecaa4595f6 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mod_n521_9_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_n521_9_alt.S @@ -40,6 +40,7 @@ #define qshort %edx S2N_BN_SYMBOL(bignum_mod_n521_9_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mod_p521_9.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_p521_9.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_mod_p521_9.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_p521_9.S index 0d67aa3ee26..7de12dd98bd 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mod_p521_9.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mod_p521_9.S @@ -39,6 +39,7 @@ #define d7 %rsi S2N_BN_SYMBOL(bignum_mod_p521_9): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521.S index 21d777a655c..4c19463d27d 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521.S @@ -41,6 +41,7 @@ adoxq %rbx, high S2N_BN_SYMBOL(bignum_montmul_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521_alt.S index b3d0d7c2c67..eea6ac1ce72 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_montmul_p521_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montmul_p521_alt.S @@ -58,6 +58,7 @@ adcq %rdx, h S2N_BN_SYMBOL(bignum_montmul_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521.S index ede53c627cf..d9b26f3ec92 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521.S @@ -52,6 +52,7 @@ adoxq zero, high S2N_BN_SYMBOL(bignum_montsqr_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521_alt.S index dccdc33ef5d..f20a99698f7 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_montsqr_p521_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_montsqr_p521_alt.S @@ -87,6 +87,7 @@ adcq $0, c 
S2N_BN_SYMBOL(bignum_montsqr_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_mul_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521.S index f96e8417ab8..19680ee6f42 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521.S @@ -36,6 +36,7 @@ adoxq %rbx, high S2N_BN_SYMBOL(bignum_mul_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_mul_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521_alt.S index a769fa0b3a8..e4488eae349 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_mul_p521_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_mul_p521_alt.S @@ -53,6 +53,7 @@ adcq %rdx, h S2N_BN_SYMBOL(bignum_mul_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_neg_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_neg_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_neg_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_neg_p521.S index 9a130b0b304..9128da8a0a9 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_neg_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_neg_p521.S @@ -30,6 +30,7 @@ #define d5 %r11 S2N_BN_SYMBOL(bignum_neg_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_optneg_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_optneg_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_optneg_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_optneg_p521.S index 8f4c740b6bc..95661fe0c24 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_optneg_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_optneg_p521.S @@ -32,6 +32,7 @@ #define d4 %r11 S2N_BN_SYMBOL(bignum_optneg_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521.S index 4b4748f1064..e300799b9c3 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521.S @@ -46,6 +46,7 @@ adoxq zero, high S2N_BN_SYMBOL(bignum_sqr_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521_alt.S index 475d3d3c812..4ae40c1e5a8 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_sqr_p521_alt.S +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sqr_p521_alt.S @@ -81,6 +81,7 @@ adcq $0, c S2N_BN_SYMBOL(bignum_sqr_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_sub_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sub_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_sub_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sub_p521.S index 03db019833e..c4ea3d31509 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_sub_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_sub_p521.S @@ -39,6 +39,7 @@ S2N_BN_SYMBOL(bignum_sub_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_tolebytes_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tolebytes_p521.S similarity index 98% rename from third_party/s2n-bignum/x86_att/p521/bignum_tolebytes_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tolebytes_p521.S index 7f891725690..077dcc51fa8 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_tolebytes_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tolebytes_p521.S @@ -28,6 +28,7 @@ #define a %rax S2N_BN_SYMBOL(bignum_tolebytes_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_tomont_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tomont_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_tomont_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tomont_p521.S index 39983c24bae..c5c678a0202 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_tomont_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_tomont_p521.S @@ -39,6 +39,7 @@ #define d7 %rsi S2N_BN_SYMBOL(bignum_tomont_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_triple_p521.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_triple_p521.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521.S index 264481ef181..aad7a2da098 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_triple_p521.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521.S @@ -40,6 +40,7 @@ S2N_BN_SYMBOL(bignum_triple_p521): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_triple_p521_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/bignum_triple_p521_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521_alt.S index ecd07987788..a12c9b1c5c8 100644 --- a/third_party/s2n-bignum/x86_att/p521/bignum_triple_p521_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/bignum_triple_p521_alt.S @@ -41,6 +41,7 @@ #define d %rdx S2N_BN_SYMBOL(bignum_triple_p521_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jadd.S rename to 
third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd.S index 9f1b03c47bd..36a856647ce 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd.S @@ -747,6 +747,7 @@ movq %rax, 64+P0 S2N_BN_SYMBOL(p521_jadd): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jadd_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd_alt.S index 5b51a4f6a62..75571f21fbf 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jadd_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jadd_alt.S @@ -1023,6 +1023,7 @@ movq %rax, 64+P0 S2N_BN_SYMBOL(p521_jadd_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jdouble.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble.S index 22ccbebd433..e0e40bfcc35 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jdouble.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble.S @@ -1286,6 +1286,7 @@ movq %rbx, 64+P0 S2N_BN_SYMBOL(p521_jdouble): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jdouble_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble_alt.S index 2dc6c321201..a420a2ce0ab 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jdouble_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jdouble_alt.S @@ -1775,6 +1775,7 @@ movq %rbx, 64+P0 S2N_BN_SYMBOL(p521_jdouble_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jmixadd.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd.S index 879fce6954f..fd9111224b9 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd.S @@ -776,6 +776,7 @@ movq %rax, 64+P0 S2N_BN_SYMBOL(p521_jmixadd): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jmixadd_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd_alt.S index d9279fe3054..9da04d0588e 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jmixadd_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jmixadd_alt.S @@ -1052,6 +1052,7 @@ movq %rax, 64+P0 S2N_BN_SYMBOL(p521_jmixadd_alt): + _CET_ENDBR #if WINDOWS_ABI pushq %rdi diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jscalarmul.S rename to 
third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul.S index 905c32a76d3..d0f73f8b423 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul.S @@ -73,6 +73,7 @@ cmovzq TAB+JACSIZE*(I-1)+64+C*NUMSIZE(%rsp), %r12 S2N_BN_SYMBOL(p521_jscalarmul): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul_alt.S similarity index 99% rename from third_party/s2n-bignum/x86_att/p521/p521_jscalarmul_alt.S rename to third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul_alt.S index ee0fca779b4..cc2a9d83319 100644 --- a/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul_alt.S +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/p521/p521_jscalarmul_alt.S @@ -73,6 +73,7 @@ cmovzq TAB+JACSIZE*(I-1)+64+C*NUMSIZE(%rsp), %r12 S2N_BN_SYMBOL(p521_jscalarmul_alt): + _CET_ENDBR // The Windows version literally calls the standard ABI version. // This simplifies the proofs since subroutine offsets are fixed. diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_add_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_add_p256k1.S new file mode 100644 index 00000000000..c78ad8ffb00 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_add_p256k1.S @@ -0,0 +1,101 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_256k1, z := (x + y) mod p_256k1, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_p256k1) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %rcx +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 + +#define dd %rax + +// These two re-use inputs x and y when safe to do so + +#define l %rsi +#define c %rdx + +S2N_BN_SYMBOL(bignum_add_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and add the two inputs as 2^256 * (-c) + [d3;d2;d1;d0] = x + y + + movq (x), d0 + addq (y), d0 + movq 8(x), d1 + adcq 8(y), d1 + movq 16(x), d2 + adcq 16(y), d2 + movq 24(x), d3 + adcq 24(y), d3 + sbbq c, c + +// Create dd = d3 AND d2 AND d1 to condense the later comparison +// We hope this will interleave with the addition, though we can't +// express that directly as the AND operation destroys the carry flag. + + movq d1, dd + andq d2, dd + andq d3, dd + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256. 
+// For the lowest word use d0 + 4294968273 >= 2^64 <=> ~4294968273 < d0 + + movq $~4294968273, l + cmpq d0, l + adcq $0, dd + sbbq $0, c + +// Now c <> 0 <=> z >= p_256k1, so mask the constant l accordingly + + notq l + cmovzq c, l + +// If z >= p_256k1 do z := z - p_256k1, i.e. add l in 4 digits + + addq l, d0 + movq d0, (z) + adcq $0, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq $0, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1.S new file mode 100644 index 00000000000..cab05713000 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1.S @@ -0,0 +1,107 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256k1 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256k1) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply + +#define x %rcx +#define c %rcx + +// Likewise this is thrown away after initial multiply + +#define d %rdx +#define h %rdx + +#define a %rax +#define ashort %eax +#define q %rax + +#define d0 %rsi +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 + +S2N_BN_SYMBOL(bignum_cmul_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want multiplier in %rdx) + + movq %rdx, x + movq %rsi, d + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] + + mulxq (x), d0, d1 + mulxq 8(x), a, d2 + addq a, d1 + mulxq 16(x), a, d3 + adcq a, d2 + mulxq 24(x), a, h + adcq a, d3 + adcq $0, h + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + leaq 1(h), q + movq $4294968273, c + mulq c + + addq %rax, d0 + adcq %rdx, d1 + adcq $0, d2 + adcq $0, d3 + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting c. 
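As an aside (editor-added, not part of the generated patch), the estimate-and-correct reduction described in the comments above — quotient estimate q = h + 1, add 4294968273 * q, then add p_256k1 back if the result went negative — can be checked against plain modular arithmetic. A minimal Python sketch, assuming only p_256k1 = 2^256 - 4294968273 and a reduced input x; the helper name is illustrative, not an exported symbol:

    p = 2**256 - 2**32 - 977            # p_256k1
    R = 2**256
    assert R - p == 4294968273

    def cmul_p256k1(c, x):
        # Mirrors the reduction sketched in the comments: q = h + 1 is either
        # exact or one too large, and a single conditional correction fixes it.
        t = c * x                        # c < 2^64, x < p, so t < 2^64 * p
        h, l = divmod(t, R)              # t = 2^256*h + l
        q = h + 1
        z = l + 4294968273 * q - R       # t - q*p, including the implicit -2^256
        if z < 0:                        # estimate was one too big
            z += p
        assert z == t % p
        return z

The same model covers bignum_cmul_p256k1_alt below, which differs only in how the initial product is accumulated.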
+ + movq $0, a + cmovcq a, c + + subq c, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1_alt.S new file mode 100644 index 00000000000..3c9e95340ca --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_cmul_p256k1_alt.S @@ -0,0 +1,119 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_256k1, z := (c * x) mod p_256k1, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_p256k1_alt +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_p256k1_alt) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply + +#define x %rcx +#define c %rsi + +// Likewise this is thrown away after initial multiply + +#define d %rdx +#define h %rdx + +#define a %rax +#define ashort %eax +#define q %rax + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %rcx + +S2N_BN_SYMBOL(bignum_cmul_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want %rdx for the high parts of products) + + movq %rdx, x + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] + + movq (x), a + mulq c + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq c + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq c + addq a, d2 + adcq $0, d + + movq 24(x), a + movq d, d3 + mulq c + addq a, d3 + adcq $0, h + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + leaq 1(h), q + movq $4294968273, c + mulq c + + addq %rax, d0 + adcq %rdx, d1 + adcq $0, d2 + adcq $0, d3 + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting c. + + movq $0, a + cmovcq a, c + + subq c, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_deamont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_deamont_p256k1.S new file mode 100644 index 00000000000..46071c3b081 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_deamont_p256k1.S @@ -0,0 +1,147 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256k1, +// Input x[4]; output z[4] +// +// extern void bignum_deamont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_p256k1) + .text + +#define z %rdi +#define x %rsi + +// Re-use x variable for the negated multiplicative inverse of p_256k1 + +#define w %rsi + +// The rotating registers for the 4 digits + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Other variables. We need d == %rdx for mulx instructions + +#define a %rax +#define d %rdx +#define c %rcx + +S2N_BN_SYMBOL(bignum_deamont_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [d3,d2,d1,d0] = x + + movq (x), d0 + movq 8(x), d1 + movq 16(x), d2 + movq 24(x), d3 + +// Set w to negated multiplicative inverse p_256k1 * w == -1 (mod 2^64). + + movq $0xd838091dd2253531, w + +// Four stages of Montgomery reduction, rotating the register window. +// Use c as a carry-catcher since the imul destroys the flags in general. + + imulq w, d0 + movq $4294968273, a + mulq d0 + subq d, d1 + sbbq c, c + + imulq w, d1 + movq $4294968273, a + mulq d1 + negq c + sbbq d, d2 + sbbq c, c + + imulq w, d2 + movq $4294968273, a + mulq d2 + negq c + sbbq d, d3 + sbbq c, c + + imulq w, d3 + movq $4294968273, a + mulq d3 + +// Take an AND of the four cofactor digits, re-using the w variable. +// We hope this will interleave nicely with the computation sequence +// above but don't want to use other registers explicitly, so put +// it all together in a block. + + movq d0, w + andq d1, w + andq d2, w + andq d3, w + +// Finish propagating carry through new top part + + xorq a, a + negq c + sbbq d, d0 + sbbq a, d1 + sbbq a, d2 + sbbq a, d3 + +// The result thus far is z = (x + q * p_256k1) / 2^256. Note that +// z < p_256k1 <=> x < (2^256 - q) * p_256k1, and since +// x < 2^256 < 2 * p_256k1, we have that *if* q < 2^256 - 1 then +// z < p_256k1. Conversely if q = 2^256 - 1 then since +// x + q * p_256k1 == 0 (mod 2^256) we have x == p_256k1 (mod 2^256) +// and thus x = p_256k1, and z >= p_256k1 (in fact z = p_256k1). +// So in summary z < p_256k1 <=> ~(q = 2^256 - 1) <=> ~(x = p_256k1). +// and hence iff q is all 1s, or equivalently dd is all 1s, we +// correct by subtracting p_256k1 to get 0. Since this is only one +// case we compute the result more explicitly rather than doing +// arithmetic with carry propagation. 
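As an aside (editor-added, not part of the patch), the four Montgomery steps above — multiply the low word by the negated inverse 0xd838091dd2253531, fold in a multiple of p_256k1 so the bottom word cancels, then apply the single correction argued in the preceding comment — compute (x / 2^256) mod p_256k1. A hedged Python reference using the standard word-by-word REDC formulation rather than the register-level subtraction sequence used here; the helper name is illustrative only:

    p = 2**256 - 2**32 - 977             # p_256k1
    w = 0xd838091dd2253531               # negated word inverse: p * w == -1 (mod 2^64)
    assert (p * w) % 2**64 == 2**64 - 1

    def deamont_p256k1(x):
        # Word-by-word Montgomery reduction of a 4-word input: (x / 2^256) mod p.
        z = x
        for _ in range(4):
            m = (z * w) % 2**64          # multiplier that clears the low 64 bits
            z = (z + m * p) >> 64        # exact division by 2^64
        if z >= p:                       # at most one correction, as argued above
            z -= p
        assert z == (x * pow(2**256, -1, p)) % p
        return z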
+ + movq $4294968273, d + addq d0, d + addq $1, w + cmovzq d, d0 + cmovzq a, d1 + cmovzq a, d2 + cmovzq a, d3 + +// write back and return + + movq d0, (z) + movq d1, 8(z) + movq d2, 16(z) + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_demont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_demont_p256k1.S new file mode 100644 index 00000000000..2edc6be8d66 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_demont_p256k1.S @@ -0,0 +1,114 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_256k1, +// assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_256k1 for correctness. If this is not the +// case, use the variant "bignum_deamont_p256k1" instead. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_p256k1) + .text + +#define z %rdi +#define x %rsi + +// Re-use x variable for the negated multiplicative inverse of p_256k1 + +#define w %rsi + +// The rotating registers for the 4 digits + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Other variables. We need d == %rdx for mulx instructions + +#define a %rax +#define d %rdx +#define c %rcx + +S2N_BN_SYMBOL(bignum_demont_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [d3,d2,d1,d0] = x + + movq (x), d0 + movq 8(x), d1 + movq 16(x), d2 + movq 24(x), d3 + +// Set w to negated multiplicative inverse p_256k1 * w == -1 (mod 2^64). + + movq $0xd838091dd2253531, w + +// Four stages of Montgomery reduction, rotating the register window. +// Use c as a carry-catcher since the imul destroys the flags in general. + + imulq w, d0 + movq $4294968273, a + mulq d0 + subq d, d1 + sbbq c, c + + imulq w, d1 + movq $4294968273, a + mulq d1 + negq c + sbbq d, d2 + sbbq c, c + + imulq w, d2 + movq $4294968273, a + mulq d2 + negq c + sbbq d, d3 + sbbq c, c + + imulq w, d3 + movq $4294968273, a + mulq d3 + negq c + sbbq d, d0 + +// Finish propagating carry through new top part, write back and return + + movq d0, (z) + sbbq $0, d1 + movq d1, 8(z) + sbbq $0, d2 + movq d2, 16(z) + sbbq $0, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_double_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_double_p256k1.S new file mode 100644 index 00000000000..fa34aeff914 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_double_p256k1.S @@ -0,0 +1,96 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_256k1, z := (2 * x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_p256k1) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rcx +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 + +#define dd %rax +#define c %rdx + +// Re-uses the input x when safe to do so + +#define l %rsi + +S2N_BN_SYMBOL(bignum_double_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the inputs and double top-down as z = 2^256 * c + [d3;d2;d1;d0] +// While doing this, create an AND dd of [d3;d2;d1] to condense comparison + + movq 24(x), d3 + movq d3, c + movq 16(x), d2 + shrq $63, c + shldq $1, d2, d3 + movq d3, dd + movq 8(x), d1 + shldq $1, d1, d2 + andq d2, dd + movq (x), d0 + shldq $1, d0, d1 + andq d1, dd + shlq $1, d0 + +// Decide whether z >= p_256k1 <=> z + 4294968273 >= 2^256. +// For the lowest word use d0 + 4294968273 >= 2^64 <=> ~4294968273 < d0 + + movq $~4294968273, l + cmpq d0, l + adcq $0, dd + adcq $0, c + +// Now c <> 0 <=> z >= p_256k1, so mask the constant l accordingly + + notq l + cmovzq c, l + +// If z >= p_256k1 do z := z - p_256k1, i.e. add l in 4 digits + + addq l, d0 + movq d0, (z) + adcq $0, d1 + movq d1, 8(z) + adcq $0, d2 + movq d2, 16(z) + adcq $0, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_half_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_half_p256k1.S new file mode 100644 index 00000000000..da8317a9e42 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_half_p256k1.S @@ -0,0 +1,86 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_256k1, z := (x / 2) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_p256k1) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rcx +#define d1 %rdx +#define d2 %r8 +#define d3 %r9 + +#define c %rax + +S2N_BN_SYMBOL(bignum_half_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the 4 digits of x, and letting b be the LSB (whether it's odd) +// construct the constant c = 4294968273 * b + + movq (x), d0 + movq $4294968273, c + movq 8(x), d1 + movq $1, d3 + andq d0, d3 + movq 16(x), d2 + cmovzq d3, c + movq 24(x), d3 + +// We want (x + b * p_256k1) / 2 where b is that LSB, in {0,1}. +// That amounts to (2^256 * b + x - 4294968273 * b) / 2, and +// modulo 4 words that's the same as ([2^256 * c + x] - c) / 2. +// So do that subtraction and shift a place right as we go. + + subq c, d0 + sbbq $0, d1 + sbbq $0, d2 + sbbq $0, d3 + sbbq $0, c + +// Shift right, pushing the carry back down, and store back + + shrdq $1, d1, d0 + movq d0, (z) + shrdq $1, d2, d1 + movq d1, 8(z) + shrdq $1, d3, d2 + movq d2, 16(z) + shrdq $1, c, d3 + movq d3, 24(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_n256k1_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_n256k1_4.S new file mode 100644 index 00000000000..79fb4d9adc3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_n256k1_4.S @@ -0,0 +1,98 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_mod_n256k1_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the secp256k1 curve. 
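As an aside (editor-added, not part of the patch), the three constant words loaded below are the little-endian word encoding of 2^256 - n_256k1, and the reduction is a single add-then-conditionally-undo step. A small Python check of both facts, with the secp256k1 group order written out explicitly; the helper name is illustrative only:

    n = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEBAAEDCE6AF48A03BBFD25E8CD0364141
    delta = 2**256 - n
    # words [n2; n1; n0] used by the code below
    assert delta == (1 << 128) | (0x4551231950b75fc4 << 64) | 0x402da1732fc9bebf

    def mod_n256k1_4(x):
        # Add 2^256 - n; a carry out of the top word means x >= n, in which case
        # the wrapped sum is already x - n; otherwise undo the addition.
        s = x + delta
        return s - 2**256 if s >= 2**256 else x

    assert mod_n256k1_4(n - 1) == n - 1 and mod_n256k1_4(n) == 0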
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_n256k1_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_n256k1_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n0 %rax +#define n1 %r10 +#define n2 %r11 + +#define n2short %r11d + +// Can re-use this as a temporary once we've loaded the input + +#define c %rsi + +S2N_BN_SYMBOL(bignum_mod_n256k1_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load a set of registers [0; n2; n1; n0] = 2^256 - n_256k1 + + movq $0x402da1732fc9bebf, n0 + movq $0x4551231950b75fc4, n1 + movl $1, n2short + +// Load the input and compute x + (2^256 - n_256k1) + + movq (x), d0 + addq n0, d0 + movq 8(x), d1 + adcq n1, d1 + movq 16(x), d2 + adcq n2, d2 + movq 24(x), d3 + adcq $0, d3 + +// Now CF is set iff 2^256 <= x + (2^256 - n_256k1), i.e. iff n_256k1 <= x. +// Create a mask for the condition x < n, and mask the three nontrivial digits +// ready to undo the previous addition with a compensating subtraction + + sbbq c, c + notq c + andq c, n0 + andq c, n1 + andq c, n2 + +// Now subtract mask * (2^256 - n_256k1) again and store + + subq n0, d0 + movq d0, (z) + sbbq n1, d1 + movq d1, 8(z) + sbbq n2, d2 + movq d2, 16(z) + sbbq $0, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_p256k1_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_p256k1_4.S new file mode 100644 index 00000000000..b6519abf266 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mod_p256k1_4.S @@ -0,0 +1,83 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_mod_p256k1_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_p256k1_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_p256k1_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 +#define c %r10 + +#define d %rax + + +S2N_BN_SYMBOL(bignum_mod_p256k1_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the inputs as [d3;d2;d1;d0] and let d be an AND of [d3;d2;d1] to +// condense the comparison below. + + movq (x), d0 + movq 8(x), d1 + movq d1, d + movq 16(x), d2 + andq d2, d + movq 24(x), d3 + andq d3, d + +// Compare x >= p_256k1 = 2^256 - 4294968273 using condensed carry: +// we get a carry from the lowest digit and all other digits are 1. +// We end up with c and d as adjusted digits for x - p_256k1 if so. 
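As an aside (editor-added, not part of the patch), the condensed-carry comparison described just above is worth spelling out: since p_256k1 = 2^256 - 4294968273, x >= p_256k1 exactly when x + 4294968273 overflows 2^256, and that overflow can only happen when the low-word addition carries and the three upper words are all ones. A short Python sketch of the same test, with boundary spot checks; the helper name is illustrative only:

    p = 2**256 - 2**32 - 977                  # p_256k1 = 2^256 - 4294968273

    def mod_p256k1_4(x):
        d = [(x >> (64 * i)) & (2**64 - 1) for i in range(4)]
        low_carry = d[0] + 4294968273 >= 2**64
        top_all_ones = (d[1] & d[2] & d[3]) == 2**64 - 1
        assert (x >= p) == (low_carry and top_all_ones)
        return x - p if x >= p else x

    for x in (0, p - 1, p, p + 1, 2**256 - 1):
        assert mod_p256k1_4(x) == x % p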
+ + movq $4294968273, c + addq d0, c + adcq $0, d + +// If indeed x >= p_256k1 then x := x - p_256k1, using c and d +// Either way, write back to z + + cmovcq c, d0 + movq d0, (z) + cmovcq d, d1 + movq d1, 8(z) + cmovcq d, d2 + movq d2, 16(z) + cmovcq d, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1.S new file mode 100644 index 00000000000..5d0b06b394f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1.S @@ -0,0 +1,235 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256k1, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256k2 (in particular this is true if we are in +// the "usual" case x < p_256k1 and y < p_256k1). +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256k1) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// Also used for multiplicative inverse in second part + +#define w %rbp + +// mulpadd(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries. + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +// mulpade(high,low,i) adds %rdx * x[i] to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. 
+ +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +S2N_BN_SYMBOL(bignum_montmul_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different + + movq (y), %rdx + + mulxq (x), %r8, %r9 + mulxq 8(x), %rax, %r10 + addq %rax, %r9 + mulxq 16(x), %rax, %r11 + adcq %rax, %r10 + mulxq 24(x), %rax, %r12 + adcq %rax, %r11 + adcq zero, %r12 + +// Add row 1 + + xorl zeroe, zeroe + movq 8(y), %rdx + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpade(%r13,%r12,24(x)) + adcxq zero, %r13 + +// Add row 2 + + xorl zeroe, zeroe + movq 16(y), %rdx + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + mulpade(%r14,%r13,24(x)); + adcxq zero, %r14 + +// Add row 3 + + xorl zeroe, zeroe + movq 24(y), %rdx + mulpadd(%r12,%r11,(x)) + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)); + mulpade(%r15,%r14,24(x)); + adcxq zero, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// Do Montgomery reductions, now using %rcx as a carry-saver. +// A direct carry chain is possible using mulx exclusively, but it +// requires more moves and overall seems to have lower performance. + + movq $0xd838091dd2253531, w + movq $4294968273, %rbx + +// Montgomery reduce row 0 + + movq %rbx, %rax + imulq w, %r8 + mulq %r8 + subq %rdx, %r9 + sbbq %rcx, %rcx + +// Montgomery reduce row 1 + + movq %rbx, %rax + imulq w, %r9 + mulq %r9 + negq %rcx + sbbq %rdx, %r10 + sbbq %rcx, %rcx + +// Montgomery reduce row 2 + + movq %rbx, %rax + imulq w, %r10 + mulq %r10 + negq %rcx + sbbq %rdx, %r11 + sbbq %rcx, %rcx + +// Montgomery reduce row 3 + + movq %rbx, %rax + imulq w, %r11 + mulq %r11 + negq %rcx + +// Now [%r15,%r14,%r13,%r12] := [%r15,%r14,%r13,%r12] + [%r11,%r10,%r9,%r8] - (%rdx + CF) + + sbbq %rdx, %r8 + sbbq $0, %r9 + sbbq $0, %r10 + sbbq $0, %r11 + + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq w, w + +// Let b be the top carry captured just above as w = (2^64-1) * b +// Now if [b,%r15,%r14,%r13,%r12] >= p_256k1, subtract p_256k1, i.e. add 4294968273 +// and either way throw away the top word. [b,%r15,%r14,%r13,%r12] - p_256k1 = +// [(b - 1),%r15,%r14,%r13,%r12] + 4294968273. 
If [%r15,%r14,%r13,%r12] + 4294968273 +// gives carry flag CF then >= comparison is top = 0 <=> b - 1 + CF = 0 which +// is equivalent to b \/ CF, and so to (2^64-1) * b + (2^64 - 1) + CF >= 2^64 + + movq %r12, %r8 + addq %rbx, %r8 + movq %r13, %r9 + adcq $0, %r9 + movq %r14, %r10 + adcq $0, %r10 + movq %r15, %r11 + adcq $0, %r11 + + adcq $-1, w + +// Write everything back + + cmovcq %r8, %r12 + movq %r12, (z) + cmovcq %r9, %r13 + movq %r13, 8(z) + cmovcq %r10, %r14 + movq %r14, 16(z) + cmovcq %r11, %r15 + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1_alt.S new file mode 100644 index 00000000000..81c7f805f2c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montmul_p256k1_alt.S @@ -0,0 +1,235 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_256k1, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_256k2 (in particular this is true if we are in +// the "usual" case x < p_256k1 and y < p_256k1). 
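As an aside (editor-added, not part of the patch), bignum_montmul_p256k1 and this _alt variant compute the same function, (x * y / 2^256) mod p_256k1; only the multiplication strategy differs (mulx/adcx/adox chains versus classic mulq schoolbook accumulation), while the Montgomery reduction and final correction are shared. A minimal Python statement of that contract, with a Montgomery-form round trip as a sanity check; names here are illustrative Python, not the assembly symbols:

    p = 2**256 - 2**32 - 977                     # p_256k1
    R_inv = pow(2**256, -1, p)

    def montmul_p256k1(x, y):
        # Functional contract shared by the mulx and _alt code paths.
        return (x * y * R_inv) % p

    x = 0x123456789ABCDEF                        # arbitrary test value
    x_mont = (x * 2**256) % p                    # to Montgomery form
    assert montmul_p256k1(x_mont, 1) == x % p    # multiplying by 1 converts back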
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_p256k1_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// Re-used for constants in second part + +#define w %rsi + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_montmul_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Start the window as [%r10;%r9;%r8] with 00 product + + movq (x), %rax + mulq (y) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Column 1 + + xorq %r11, %r11 + combads(%r10,%r9,(x),8(y)) + combadz(%r11,%r10,%r9,8(x),(y)) + +// Column 2 + + xorq %r12, %r12 + combadz(%r12,%r11,%r10,(x),16(y)) + combadd(%r12,%r11,%r10,8(x),8(y)) + combadd(%r12,%r11,%r10,16(x),(y)) + +// Column 3 + + xorq %r13, %r13 + combadz(%r13,%r12,%r11,(x),24(y)) + combadd(%r13,%r12,%r11,8(x),16(y)) + combadd(%r13,%r12,%r11,16(x),8(y)) + combadd(%r13,%r12,%r11,24(x),(y)) + +// Column 4 + + xorq %r14, %r14 + combadz(%r14,%r13,%r12,8(x),24(y)) + combadd(%r14,%r13,%r12,16(x),16(y)) + combadd(%r14,%r13,%r12,24(x),8(y)) + +// Column 5 + + xorq %r15, %r15 + combadz(%r15,%r14,%r13,16(x),24(y)) + combadd(%r15,%r14,%r13,24(x),16(y)) + +// Final work for columns 6 and 7 + + movq 24(x), %rax + mulq 24(y) + addq %rax, %r14 + adcq %rdx, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// Do Montgomery reductions, now using %rcx as a carry-saver. + + movq $0xd838091dd2253531, w + movq $4294968273, %rbx + +// Montgomery reduce row 0 + + movq %rbx, %rax + imulq w, %r8 + mulq %r8 + subq %rdx, %r9 + sbbq %rcx, %rcx + +// Montgomery reduce row 1 + + movq %rbx, %rax + imulq w, %r9 + mulq %r9 + negq %rcx + sbbq %rdx, %r10 + sbbq %rcx, %rcx + +// Montgomery reduce row 2 + + movq %rbx, %rax + imulq w, %r10 + mulq %r10 + negq %rcx + sbbq %rdx, %r11 + sbbq %rcx, %rcx + +// Montgomery reduce row 3 + + movq %rbx, %rax + imulq w, %r11 + mulq %r11 + negq %rcx + +// Now [%r15,%r14,%r13,%r12] := [%r15,%r14,%r13,%r12] + [%r11,%r10,%r9,%r8] - (%rdx + CF) + + sbbq %rdx, %r8 + sbbq $0, %r9 + sbbq $0, %r10 + sbbq $0, %r11 + + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq w, w + +// Let b be the top carry captured just above as w = (2^64-1) * b +// Now if [b,%r15,%r14,%r13,%r12] >= p_256k1, subtract p_256k1, i.e. add 4294968273 +// and either way throw away the top word. [b,%r15,%r14,%r13,%r12] - p_256k1 = +// [(b - 1),%r15,%r14,%r13,%r12] + 4294968273. 
If [%r15,%r14,%r13,%r12] + 4294968273 +// gives carry flag CF then >= comparison is top = 0 <=> b - 1 + CF = 0 which +// is equivalent to b \/ CF, and so to (2^64-1) * b + (2^64 - 1) + CF >= 2^64 + + movq %r12, %r8 + addq %rbx, %r8 + movq %r13, %r9 + adcq $0, %r9 + movq %r14, %r10 + adcq $0, %r10 + movq %r15, %r11 + adcq $0, %r11 + + adcq $-1, w + +// Write everything back + + cmovcq %r8, %r12 + movq %r12, (z) + cmovcq %r9, %r13 + movq %r13, 8(z) + cmovcq %r10, %r14 + movq %r14, 16(z) + cmovcq %r11, %r15 + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1.S new file mode 100644 index 00000000000..f1c8a62730f --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1.S @@ -0,0 +1,213 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256k1, assuming x^2 <= 2^256 * p_256k1, which +// is guaranteed in particular if x < p_256k1 initially (the "intended" case). +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256k1) + .text + +#define z %rdi +#define x %rsi + +// Use this fairly consistently for a zero + +#define zero %rbp +#define zeroe %ebp + +// Also use the same register for multiplicative inverse in Montgomery stage + +#define w %rbp + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +S2N_BN_SYMBOL(bignum_montsqr_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rdx + mulxq %rdx, %r8, %r15 + mulxq 8(x), %r9, %r10 + mulxq 24(x), %r11, %r12 + movq 16(x), %rdx + mulxq 24(x), %r13, %r14 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + movq 24(x), %rdx + mulpadd(%r13,%r12,8(x)) + adcxq zero, %r13 + adoxq zero, %r14 + adcq zero, %r14 + +// Double and add to the 00 + 11 + 22 + 33 terms + + xorl zeroe, zeroe + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 8(x), %rdx + mulxq 
%rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 16(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 24(x), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq zero, %r15 + adoxq zero, %r15 + +// Now we have the full 8-digit square 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// Do Montgomery reductions, now using %rcx as a carry save + + movq $0xd838091dd2253531, w + movq $4294968273, %rbx + +// Montgomery reduce row 0 + + movq %rbx, %rax + imulq w, %r8 + mulq %r8 + subq %rdx, %r9 + sbbq %rcx, %rcx + +// Montgomery reduce row 1 + + movq %rbx, %rax + imulq w, %r9 + mulq %r9 + negq %rcx + sbbq %rdx, %r10 + sbbq %rcx, %rcx + +// Montgomery reduce row 2 + + movq %rbx, %rax + imulq w, %r10 + mulq %r10 + negq %rcx + sbbq %rdx, %r11 + sbbq %rcx, %rcx + +// Montgomery reduce row 3 + + movq %rbx, %rax + imulq w, %r11 + mulq %r11 + negq %rcx + +// Now [%r15,%r14,%r13,%r12] := [%r15,%r14,%r13,%r12] + [%r11,%r10,%r9,%r8] - (%rdx + CF) + + sbbq %rdx, %r8 + sbbq $0, %r9 + sbbq $0, %r10 + sbbq $0, %r11 + + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq w, w + +// Let b be the top carry captured just above as w = (2^64-1) * b +// Now if [b,%r15,%r14,%r13,%r12] >= p_256k1, subtract p_256k1, i.e. add 4294968273 +// and either way throw away the top word. [b,%r15,%r14,%r13,%r12] - p_256k1 = +// [(b - 1),%r15,%r14,%r13,%r12] + 4294968273. If [%r15,%r14,%r13,%r12] + 4294968273 +// gives carry flag CF then >= comparison is top = 0 <=> b - 1 + CF = 0 which +// is equivalent to b \/ CF, and so to (2^64-1) * b + (2^64 - 1) + CF >= 2^64 + + movq %r12, %r8 + addq %rbx, %r8 + movq %r13, %r9 + adcq $0, %r9 + movq %r14, %r10 + adcq $0, %r10 + movq %r15, %r11 + adcq $0, %r11 + + adcq $-1, w + +// Write everything back + + cmovcq %r8, %r12 + movq %r12, (z) + cmovcq %r9, %r13 + movq %r13, 8(z) + cmovcq %r10, %r14 + movq %r14, 16(z) + cmovcq %r11, %r15 + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1_alt.S new file mode 100644 index 00000000000..ba64eda56cc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_montsqr_p256k1_alt.S @@ -0,0 +1,218 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_256k1, assuming x^2 <= 2^256 * p_256k1, which +// is guaranteed in particular if x < p_256k1 initially (the "intended" case). 
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_p256k1_alt) + .text + +#define z %rdi +#define x %rsi + +// Re-used for constants in second part + +#define w %rsi + +// Macro for the key "multiply and add to (c,h,l)" step, for square term + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling before adding, for non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_montsqr_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Result term 1 + + xorq %r11, %r11 + combadd2(%r11,%r10,%r9,(x),8(x)) + +// Result term 2 + + xorq %r12, %r12 + combadd1(%r12,%r11,%r10,8(x)) + combadd2(%r12,%r11,%r10,(x),16(x)) + +// Result term 3 + + xorq %r13, %r13 + combadd2(%r13,%r12,%r11,(x),24(x)) + combadd2(%r13,%r12,%r11,8(x),16(x)) + +// Result term 4 + + xorq %r14, %r14 + combadd2(%r14,%r13,%r12,8(x),24(x)) + combadd1(%r14,%r13,%r12,16(x)) + +// Result term 5 + + xorq %r15, %r15 + combadd2(%r15,%r14,%r13,16(x),24(x)) + +// Result term 6 + + combads(%r15,%r14,24(x)) + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// Do Montgomery reductions, now using %rcx as a carry-saver. + + movq $0xd838091dd2253531, w + movq $4294968273, %rbx + +// Montgomery reduce row 0 + + movq %rbx, %rax + imulq w, %r8 + mulq %r8 + subq %rdx, %r9 + sbbq %rcx, %rcx + +// Montgomery reduce row 1 + + movq %rbx, %rax + imulq w, %r9 + mulq %r9 + negq %rcx + sbbq %rdx, %r10 + sbbq %rcx, %rcx + +// Montgomery reduce row 2 + + movq %rbx, %rax + imulq w, %r10 + mulq %r10 + negq %rcx + sbbq %rdx, %r11 + sbbq %rcx, %rcx + +// Montgomery reduce row 3 + + movq %rbx, %rax + imulq w, %r11 + mulq %r11 + negq %rcx + +// Now [%r15,%r14,%r13,%r12] := [%r15,%r14,%r13,%r12] + [%r11,%r10,%r9,%r8] - (%rdx + CF) + + sbbq %rdx, %r8 + sbbq $0, %r9 + sbbq $0, %r10 + sbbq $0, %r11 + + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq w, w + +// Let b be the top carry captured just above as w = (2^64-1) * b +// Now if [b,%r15,%r14,%r13,%r12] >= p_256k1, subtract p_256k1, i.e. add 4294968273 +// and either way throw away the top word. [b,%r15,%r14,%r13,%r12] - p_256k1 = +// [(b - 1),%r15,%r14,%r13,%r12] + 4294968273. 
If [%r15,%r14,%r13,%r12] + 4294968273 +// gives carry flag CF then >= comparison is top = 0 <=> b - 1 + CF = 0 which +// is equivalent to b \/ CF, and so to (2^64-1) * b + (2^64 - 1) + CF >= 2^64 + + movq %r12, %r8 + addq %rbx, %r8 + movq %r13, %r9 + adcq $0, %r9 + movq %r14, %r10 + adcq $0, %r10 + movq %r15, %r11 + adcq $0, %r11 + + adcq $-1, w + +// Write everything back + + cmovcq %r8, %r12 + movq %r12, (z) + cmovcq %r9, %r13 + movq %r13, 8(z) + cmovcq %r10, %r14 + movq %r14, 16(z) + cmovcq %r11, %r15 + movq %r15, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1.S new file mode 100644 index 00000000000..b520622a177 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1.S @@ -0,0 +1,184 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_256k1, z := (x * y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p256k1) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// A zero register + +#define zero %rbp +#define zeroe %ebp + +// mulpadd(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries. + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +// mulpade(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. 
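+// Both macros keep two independent carry chains live at once: adcx folds the
+// low halves of the mulx products into the running digits through CF while
+// adox folds the high halves through OF, so the two chains interleave without
+// any flag spills between steps.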
+ +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +S2N_BN_SYMBOL(bignum_mul_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different + + movq (y), %rdx + + mulxq (x), %r8, %r9 + mulxq 8(x), %rax, %r10 + addq %rax, %r9 + mulxq 16(x), %rax, %r11 + adcq %rax, %r10 + mulxq 24(x), %rax, %r12 + adcq %rax, %r11 + adcq zero, %r12 + +// Add row 1 + + xorl zeroe, zeroe + movq 8(y), %rdx + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpade(%r13,%r12,24(x)) + adcxq zero, %r13 + +// Add row 2 + + xorl zeroe, zeroe + movq 16(y), %rdx + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + mulpade(%r14,%r13,24(x)); + adcxq zero, %r14 + +// Add row 3 + + xorl zeroe, zeroe + movq 24(y), %rdx + mulpadd(%r12,%r11,(x)) + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)); + mulpade(%r15,%r14,24(x)); + adcxq zero, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 4294968273 * h + l (mod p_256k1) + + movq $4294968273, %rdx + + xorl zeroe, zeroe + + mulpadd(%r9,%r8,%r12) + mulpadd(%r10,%r9,%r13) + mulpadd(%r11,%r10,%r14) + mulpade(%r12,%r11,%r15) + adcxq zero, %r12 + +// Now we have reduced to 5 digits, 2^256 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + leaq 1(%r12), %rax + mulxq %rax, %rax, %rbx + addq %rax, %r8 + adcq %rbx, %r9 + adcq zero, %r10 + adcq zero, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [%r11,%r10,%r9,%r8] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + cmovcq zero, %rdx + subq %rdx, %r8 + sbbq zero, %r9 + sbbq zero, %r10 + sbbq zero, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1_alt.S new file mode 100644 index 00000000000..f63667dcbf8 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_mul_p256k1_alt.S @@ -0,0 +1,211 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply modulo p_256k1, z := (x * y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_mul_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_p256k1_alt) + .text + +// These are actually right + +#define z %rdi +#define x %rsi + +// Copied in or set up + +#define y %rcx + +// Re-use input pointers later for constant and top carry + +#define d %rsi +#define c %rcx + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +S2N_BN_SYMBOL(bignum_mul_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Start the window as [%r10;%r9;%r8] with 00 product + + movq (x), %rax + mulq (y) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Column 1 + + xorq %r11, %r11 + combads(%r10,%r9,(x),8(y)) + combadd(%r11,%r10,%r9,8(x),(y)) + +// Column 2 + + xorq %r12, %r12 + combadz(%r12,%r11,%r10,(x),16(y)) + combadd(%r12,%r11,%r10,8(x),8(y)) + combadd(%r12,%r11,%r10,16(x),(y)) + +// Column 3 + + xorq %r13, %r13 + combadz(%r13,%r12,%r11,(x),24(y)) + combadd(%r13,%r12,%r11,8(x),16(y)) + combadd(%r13,%r12,%r11,16(x),8(y)) + combadd(%r13,%r12,%r11,24(x),(y)) + +// Column 4 + + xorq %r14, %r14 + combadz(%r14,%r13,%r12,8(x),24(y)) + combadd(%r14,%r13,%r12,16(x),16(y)) + combadd(%r14,%r13,%r12,24(x),8(y)) + +// Column 5 + + xorq %r15, %r15 + combadz(%r15,%r14,%r13,16(x),24(y)) + combadd(%r15,%r14,%r13,24(x),16(y)) + +// Final work for columns 6 and 7 + + movq 24(x), %rax + mulq 24(y) + addq %rax, %r14 + adcq %rdx, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 4294968273 * h + l (mod p_256k1) + + movq $4294968273, d + + movq %r12, %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + sbbq c, c + + movq %r13, %rax + mulq d + subq c, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq c, c + + movq %r14, %rax + mulq d + subq c, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq c, c + + movq %r15, %rax + mulq d + subq c, %rdx + xorq c, c + addq %rax, %r11 + movq %rdx, %r12 + adcq c, %r12 + +// Now we have reduced to 5 digits, 2^256 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. 
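+// The estimate works because p_256k1 = 2^256 - 4294968273, so
+// z - q * p_256k1 = l + (h + 1) * 4294968273 - 2^256. The addition just below
+// computes l + (h + 1) * 4294968273; its carry-out is set exactly when that
+// sum reaches 2^256, i.e. when q was the true quotient, and clear when q was
+// 1 too big and p_256k1 has to be added back.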
+ + leaq 1(%r12), %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + adcq c, %r10 + adcq c, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [%r11,%r10,%r9,%r8] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + sbbq %rax, %rax + notq %rax + andq d, %rax + subq %rax, %r8 + sbbq c, %r9 + sbbq c, %r10 + sbbq c, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_neg_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_neg_p256k1.S new file mode 100644 index 00000000000..f7594ffe5b7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_neg_p256k1.S @@ -0,0 +1,84 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_256k1, z := (-x) mod p_256k1, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_p256k1) + .text + +#define z %rdi +#define x %rsi + +#define q %rdx +#define n0 %rax +#define n1 %rcx +#define n2 %r8 +#define n3 %r9 + +#define c %r10 + +#define qshort %esi + +S2N_BN_SYMBOL(bignum_neg_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the 4 digits of x and let q be an OR of all the digits + + movq (x), n0 + movq n0, q + movq 8(x), n1 + orq n1, q + movq 16(x), n2 + orq n2, q + movq 24(x), n3 + orq n3, q + +// Turn q into a strict bitmask, and c a masked constant -4294968273 + + negq q + sbbq q, q + movq $-4294968273, c + andq q, c + +// Now just do [2^256 - 4294968273] - x where the constant is masked + + subq n0, c + movq c, (z) + movq q, c + sbbq n1, c + movq c, 8(z) + movq q, c + sbbq n2, c + movq c, 16(z) + sbbq n3, q + movq q, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_optneg_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_optneg_p256k1.S new file mode 100644 index 00000000000..657c742f8f3 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_optneg_p256k1.S @@ -0,0 +1,94 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_256k1, z := (-x) mod p_256k1 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_p256k1 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = p, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = p, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_p256k1) + .text + +#define z %rdi +#define q %rsi +#define x %rdx + +#define n0 %rax +#define n1 %rcx +#define n2 %r8 +#define n3 %r9 + +#define c %r10 + +#define qshort %esi + +S2N_BN_SYMBOL(bignum_optneg_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load the 4 digits of x and let c be an OR of all the digits + + movq (x), n0 + movq n0, c + movq 8(x), n1 + orq n1, c + movq 16(x), n2 + orq n2, c + movq 24(x), n3 + orq n3, c + +// Turn q into a strict bitmask. Force it to zero if the input is zero, +// to avoid giving -0 = p_256k1, which is not reduced though correct modulo. + + cmovzq c, q + negq q + sbbq q, q + +// We want z := if q then (2^256 - 4294968273) - x else x +// which is: [if q then ~x else x] - [if q then 4294968272 else 0] + + xorq q, n0 + xorq q, n1 + xorq q, n2 + xorq q, n3 + + movq $4294968272, c + andq q, c + xorl qshort, qshort + + subq c, n0 + movq n0, (z) + sbbq q, n1 + movq n1, 8(z) + sbbq q, n2 + movq n2, 16(z) + sbbq q, n3 + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1.S new file mode 100644 index 00000000000..21959c8ec99 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1.S @@ -0,0 +1,175 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_256k1, z := (x^2) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p256k1) + .text + +#define z %rdi +#define x %rsi + +// Use this fairly consistently for a zero + +#define zero %rbx +#define zeroe %ebx + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rcx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rcx ; \ + adcxq %rax, low ; \ + adoxq %rcx, high + +// mulpade(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. + +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +S2N_BN_SYMBOL(bignum_sqr_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rdx + mulxq %rdx, %r8, %r15 + mulxq 8(x), %r9, %r10 + mulxq 24(x), %r11, %r12 + movq 16(x), %rdx + mulxq 24(x), %r13, %r14 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + movq 24(x), %rdx + mulpadd(%r13,%r12,8(x)) + adcxq zero, %r13 + adoxq zero, %r14 + adcq zero, %r14 + +// Double and add to the 00 + 11 + 22 + 33 terms + + xorl zeroe, zeroe + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 8(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 16(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 24(x), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq zero, %r15 + adoxq zero, %r15 + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 4294968273 * h + l (mod p_256k1) + + movq $4294968273, %rdx + + xorl zeroe, zeroe + + mulpadd(%r9,%r8,%r12) + mulpadd(%r10,%r9,%r13) + mulpadd(%r11,%r10,%r14) + mulpade(%r12,%r11,%r15) + adcxq zero, %r12 + +// Now we have reduced to 5 digits, 2^256 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + leaq 1(%r12), %rax + mulxq %rax, %rax, %rcx + addq %rax, %r8 + adcq %rcx, %r9 + adcq zero, %r10 + adcq zero, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [%r11,%r10,%r9,%r8] +// So we correct if CF = 0 by subtracting 4294968273, i.e. 
by +// adding p_256k1 to the "full" answer + + sbbq %rax, %rax + notq %rax + andq %rdx, %rax + subq %rax, %r8 + sbbq zero, %r9 + sbbq zero, %r10 + sbbq zero, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1_alt.S new file mode 100644 index 00000000000..cebcd031d80 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sqr_p256k1_alt.S @@ -0,0 +1,195 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Square modulo p_256k1, z := (x^2) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_sqr_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sqr_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sqr_p256k1_alt) + .text + +#define z %rdi +#define x %rsi + +// Re-use input pointer later for constant + +#define d %rsi +#define c %rcx + +// Macro for the key "multiply and add to (c,h,l)" step, for square term + +#define combadd1(c,h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa) \ + movq numa, %rax ; \ + mulq %rax; \ + addq %rax, l ; \ + adcq %rdx, h + +// A version doubling before adding, for non-square terms + +#define combadd2(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0, c ; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +S2N_BN_SYMBOL(bignum_sqr_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Result term 0 + + movq (x), %rax + mulq %rax + + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Result term 1 + + xorq %r11, %r11 + combadd2(%r11,%r10,%r9,(x),8(x)) + +// Result term 2 + + xorq %r12, %r12 + combadd1(%r12,%r11,%r10,8(x)) + combadd2(%r12,%r11,%r10,(x),16(x)) + +// Result term 3 + + xorq %r13, %r13 + combadd2(%r13,%r12,%r11,(x),24(x)) + combadd2(%r13,%r12,%r11,8(x),16(x)) + +// Result term 4 + + xorq %r14, %r14 + combadd2(%r14,%r13,%r12,8(x),24(x)) + combadd1(%r14,%r13,%r12,16(x)) + +// Result term 5 + + xorq %r15, %r15 + combadd2(%r15,%r14,%r13,16(x),24(x)) + +// Result term 6 + + combads(%r15,%r14,24(x)) + +// Now we have the full 8-digit product 2^256 * h + l where +// h = [%r15,%r14,%r13,%r12] and l = [%r11,%r10,%r9,%r8] +// and this is == 4294968273 * h + l (mod p_256k1) + + movq $4294968273, d + + movq %r12, %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + sbbq c, c + + movq %r13, %rax + mulq d + subq c, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq c, c + + movq 
%r14, %rax + mulq d + subq c, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq c, c + + movq %r15, %rax + mulq d + subq c, %rdx + xorq c, c + addq %rax, %r11 + movq %rdx, %r12 + adcq c, %r12 + +// Now we have reduced to 5 digits, 2^256 * h + l = [%r12,%r11,%r10,%r9,%r8] +// Use q = h + 1 as the initial quotient estimate, either right or 1 too big. + + leaq 1(%r12), %rax + mulq d + addq %rax, %r8 + adcq %rdx, %r9 + adcq c, %r10 + adcq c, %r11 + +// Now the effective answer is 2^256 * (CF - 1) + [%r11,%r10,%r9,%r8] +// So we correct if CF = 0 by subtracting 4294968273, i.e. by +// adding p_256k1 to the "full" answer + + sbbq %rax, %rax + notq %rax + andq d, %rax + subq %rax, %r8 + sbbq c, %r9 + sbbq c, %r10 + sbbq c, %r11 + +// Write everything back + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +// Restore registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sub_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sub_p256k1.S new file mode 100644 index 00000000000..fe4582592c6 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_sub_p256k1.S @@ -0,0 +1,87 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_256k1, z := (x - y) mod p_256k1 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_p256k1) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +#define zero %rax +#define zeroe %eax +#define c %rcx + +S2N_BN_SYMBOL(bignum_sub_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Zero a register first + + xorl zeroe, zeroe + +// Load and subtract the two inputs as [d3;d2;d1;d0] = x - y (modulo 2^256) + + movq (x), d0 + subq (y), d0 + movq 8(x), d1 + sbbq 8(y), d1 + movq 16(x), d2 + sbbq 16(y), d2 + movq 24(x), d3 + sbbq 24(y), d3 + +// Now if x < y we want to add back p_256k1, which staying within 4 digits +// means subtracting 4294968273, since p_256k1 = 2^256 - 4294968273. +// Let c be that constant 4294968273 when x < y, zero otherwise. + + movq $4294968273, c + cmovncq zero, c + +// Now correct by adding masked p_256k1, i.e. 
subtracting c, and write back + + subq c, d0 + movq d0, (z) + sbbq zero, d1 + movq d1, 8(z) + sbbq zero, d2 + movq d2, 16(z) + sbbq zero, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1.S new file mode 100644 index 00000000000..92f97e05677 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1.S @@ -0,0 +1,105 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256k1) + .text + +#define z %rdi +#define x %rsi + +#define d %rdx + +#define a %rax +#define ashort %eax +#define q %rax + +#define d0 %rcx +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 + +// Re-use the x argument later on when it's no longer needed + +#define h %rsi +#define c %rsi + +S2N_BN_SYMBOL(bignum_tomont_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Since 2^256 == 4294968273 (mod p_256k1) we more or less just set +// m = 4294968273 then devolve to a variant of bignum_cmul_p256k1; +// the logic that q = h + 1 < 2^64 and hence doesn't wrap still holds +// since the multiplier 4294968273 is known to be much less than 2^64. +// We keep this constant in %rdx throughout as it's used repeatedly. + + movq $4294968273, d + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] +// But immediately add 1 to h to get q = h + 1 as the quotient estimate. + + mulxq (x), d0, d1 + mulxq 8(x), a, d2 + addq a, d1 + mulxq 16(x), a, d3 + adcq a, d2 + mulxq 24(x), a, h + adcq a, d3 + adcq $1, h + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + mulxq h, a, c + + addq a, d0 + adcq c, d1 + adcq $0, d2 + adcq $0, d3 + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting c. 
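+// As a concrete sanity check, x = 1 must map to 2^256 mod p_256k1 =
+// 4294968273: there q = 1, the accumulated value is 2 * 4294968273 with no
+// carry-out, and the correction below subtracts 4294968273 again.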
+ + movq $0, a + cmovcq a, d + + subq d, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1_alt.S new file mode 100644 index 00000000000..572a7883083 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_tomont_p256k1_alt.S @@ -0,0 +1,115 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_p256k1_alt) + .text + +#define z %rdi +#define x %rsi + +#define c %rcx +#define d %rdx +#define h %rdx + +#define a %rax +#define ashort %eax +#define q %rax + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %rsi + +S2N_BN_SYMBOL(bignum_tomont_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Since 2^256 == 4294968273 (mod p_256k1) we more or less just set +// m = 4294968273 then devolve to a variant of bignum_cmul_p256k1; +// the logic that q = h + 1 < 2^64 and hence doesn't wrap still holds +// since the multiplier 4294968273 is known to be much less than 2^64. +// We keep this constant in %rcx throughout as it's used repeatedly. + + movq $4294968273, c + +// Multiply, accumulating the result as 2^256 * h + [d3;d2;d1;d0] + + movq (x), a + mulq c + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq c + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq c + addq a, d2 + adcq $0, d + + movq 24(x), a + movq d, d3 + mulq c + addq a, d3 + adcq $0, h + +// Now the quotient estimate is q = h + 1, and then we do the reduction, +// writing z = [d3;d2;d1;d0], as z' = (2^256 * h + z) - q * p_256k1 = +// (2^256 * h + z) - q * (2^256 - 4294968273) = -2^256 + (z + 4294968273 * q) + + leaq 1(h), q + mulq c + + addq %rax, d0 + adcq %rdx, d1 + adcq $0, d2 + adcq $0, d3 + +// Because of the implicit -2^256, CF means >= 0 so z' is the answer; ~CF +// means z' < 0 so we add p_256k1, which in 4 digits means subtracting c. 
+ + movq $0, a + cmovcq a, c + + subq c, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1.S new file mode 100644 index 00000000000..f9f3ef3cd49 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1.S @@ -0,0 +1,120 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256k1, z := (3 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256k1 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo +// p_256k1, and the result is always fully reduced, z = (3 * x) mod p_256k1. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256k1) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256k1) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define q %rdx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx + +#define ashort %eax +#define qshort %edx + +S2N_BN_SYMBOL(bignum_triple_p256k1): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + xorl ashort, ashort + + movq (x), q + movq q, d0 + adcxq q, q + adoxq q, d0 + movq 8(x), q + movq q, d1 + adcxq q, q + adoxq q, d1 + movq 16(x), q + movq q, d2 + adcxq q, q + adoxq q, d2 + movq 24(x), q + movq q, d3 + adcxq q, q + adoxq q, d3 + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256k1 <= z - q * p_256k1 < p_256k1. + + movl $1, qshort + adcxq a, q + adoxq a, q + +// Initial subtraction of z - q * p_256k1, actually by adding q * 4294968273. + + movq $4294968273, c + xorq a, a + imulq c, q + addq q, d0 + adcq a, d1 + adcq a, d2 + adcq a, d3 + +// With z = 2^256 * h + l, the underlying result z' is actually +// (2^256 * h + l) - q * (2^256 - 4294968273) = (l + q * 4294968273) - 2^256 +// so carry-clear <=> z' is negative. Correct by subtracting in that case. +// In any case, write final result to z as we go. 
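+// The bounds are comfortable here: x < 2^256 gives h <= 2 and q <= 3, so
+// q * 4294968273 < 2^35 and the single conditional subtraction below always
+// lands the result in [0, p_256k1).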
+ + cmovcq a, c + + subq c, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1_alt.S new file mode 100644 index 00000000000..ebd15d3c43a --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/bignum_triple_p256k1_alt.S @@ -0,0 +1,122 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_256k1, z := (3 * x) mod p_256k1 +// Input x[4]; output z[4] +// +// extern void bignum_triple_p256k1_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo +// p_256k1, and the result is always fully reduced, z = (3 * x) mod p_256k1. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_p256k1_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_p256k1_alt) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define d %rdx +#define h %rdx +#define q %rdx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx + +#define ashort %eax +#define qshort %edx + +S2N_BN_SYMBOL(bignum_triple_p256k1_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + movq $3, c + + movq (x), a + mulq c + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq c + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq c + addq a, d2 + adcq $0, d + + movq 24(x), a + movq d, d3 + mulq c + addq a, d3 + adcq $1, h + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_256k1 <= z - q * p_256k1 < p_256k1. +// Initial subtraction of z - q * p_256k1, actually by adding q * 4294968273. + + movq $4294968273, c + xorq a, a + imulq c, q + addq q, d0 + adcq a, d1 + adcq a, d2 + adcq a, d3 + +// With z = 2^256 * h + l, the underlying result z' is actually +// (2^256 * h + l) - q * (2^256 - 4294968273) = (l + q * 4294968273) - 2^256 +// so carry-clear <=> z' is negative. Correct by subtracting in that case. +// In any case, write final result to z as we go. 
+ + cmovcq a, c + + subq c, d0 + movq d0, (z) + sbbq a, d1 + movq d1, 8(z) + sbbq a, d2 + movq d2, 16(z) + sbbq a, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd.S new file mode 100644 index 00000000000..3237c0aa797 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd.S @@ -0,0 +1,425 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_256k1, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// all of which are maintained throughout the code. 
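+// Each point is 12 64-bit words: the x, y and z field elements in that order,
+// each occupying NUMSIZE = 32 bytes (4 limbs), so the macros below are just
+// fixed offsets from the corresponding pointer.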
+ +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds exactly to bignum_mul_p256k1 + +#define mul_p256k1(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movabs $0x1000003d1, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + cmovbq %rcx, %rdx ; \ + subq %rdx, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sqr_p256k1 + +#define sqr_p256k1(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + 
mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movabs $0x1000003d1, %rdx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rcx ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + xorl %eax, %eax ; \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + sbbq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + movabs $0x1000003d1, %rcx ; \ + cmovae %rax, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rax, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq %rax, %r11 ; \ + movq %r11, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 + +#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(secp256k1_jadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it stays + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations + + sqr_p256k1(z1sq,z_1) + sqr_p256k1(z2sq,z_2) + + mul_p256k1(y1a,z_2,y_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,z1sq,x_2) + mul_p256k1(x1a,z2sq,x_1) + mul_p256k1(y2a,z1sq,y2a) + mul_p256k1(y1a,z2sq,y1a) + + sub_p256k1(xd,x2a,x1a) + sub_p256k1(yd,y2a,y1a) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x1a) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(xd,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + 
mul_p256k1(t1,t1,y1a) + mul_p256k1(resz,xd,z_2) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd_alt.S new file mode 100644 index 00000000000..abafb033c6d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jadd_alt.S @@ -0,0 +1,506 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input points p1 and p2 are +// fully reduced mod p_256k1, that both z coordinates are nonzero and +// that neither p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents +// the same affine point as". +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// all of which are maintained throughout the code. 
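+// Note that %rsi stays live here as the p1 pointer, which is why the field
+// macros below keep the 4294968273 constant in %rbx rather than in %rsi as
+// the standalone bignum_mul_p256k1_alt does.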
+ +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds to bignum_mul_p256k1_alt except %rsi -> %rbx + +#define mul_p256k1(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq 
%rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds to bignum_sqr_p256k1_alt except for %rsi -> %rbx + +#define sqr_p256k1(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + xorl %eax, %eax ; \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + sbbq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + movabs $0x1000003d1, %rcx ; \ + cmovae %rax, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rax, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq %rax, %r11 ; \ + movq %r11, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 + 
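+// muxload4 below is a branch-free three-way select: it loads P0 by default,
+// overrides with P1 when CF is set ("B", i.e. z_2 = 0 and z_1 != 0) and with
+// P2 on "NBE" (z_1 = 0 and z_2 != 0), matching the condition codes set up by
+// the z-coordinate comparison near the end of the routine.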
+#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(secp256k1_jadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it stays + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations + + sqr_p256k1(z1sq,z_1) + sqr_p256k1(z2sq,z_2) + + mul_p256k1(y1a,z_2,y_1) + mul_p256k1(y2a,z_1,y_2) + + mul_p256k1(x2a,z1sq,x_2) + mul_p256k1(x1a,z2sq,x_1) + mul_p256k1(y2a,z1sq,y2a) + mul_p256k1(y1a,z2sq,y1a) + + sub_p256k1(xd,x2a,x1a) + sub_p256k1(yd,y2a,y1a) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x1a) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(xd,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y1a) + mul_p256k1(resz,xd,z_2) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble.S new file mode 100644 index 00000000000..acecec83ca9 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble.S @@ -0,0 +1,619 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). 
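+// (The entry point defined below is secp256k1_jdouble; the field arithmetic
+// uses the plain modular mul_p256k1/sqr_p256k1 macros rather than Montgomery
+// form.)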
+// It is assumed that all coordinates of the input point are fully +// reduced mod p_256k1 and that the z coordinate is not zero. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define x_2 (NUMSIZE*0)(%rsp) +#define y_2 (NUMSIZE*1)(%rsp) +#define d (NUMSIZE*2)(%rsp) +#define tmp (NUMSIZE*3)(%rsp) +#define x_4 (NUMSIZE*4)(%rsp) +#define y_4 (NUMSIZE*6)(%rsp) +#define dx2 (NUMSIZE*8)(%rsp) +#define xy2 (NUMSIZE*10)(%rsp) + +#define NSPACE (NUMSIZE*12) + +// Corresponds exactly to bignum_mul_p256k1 + +#define mul_p256k1(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movabsq $0x1000003d1, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + cmovbq %rcx, %rdx ; \ + subq %rdx, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sqr_p256k1 + +#define sqr_p256k1(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, 
%r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movabsq $0x1000003d1, %rdx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rcx ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Rough versions producing 5-word results + +#define roughmul_p256k1(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movabsq $0x1000003d1, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; 
\ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +#define roughsqr_p256k1(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movabsq $0x1000003d1, %rdx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Weak doubling operation, staying in 4 digits but not in general +// fully normalizing + +#define weakdouble_p256k1(P0,P1) \ + movq 24+P1, %r11 ; \ + movq 16+P1, %r10 ; \ + movq $0x1000003d1, %rax ; \ + xorq %rdx, %rdx ; \ + shldq $1, %r10, %r11 ; \ + cmovncq %rdx, %rax ; \ + movq 8+P1, %r9 ; \ + shldq $1, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $1, %r8, %r9 ; \ + shlq $1, %r8 ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %r11, 24+P0 + +// P0 = C * P1 - D * P2 with 5-word inputs P1 and P2 +// Only used here with C = 12, D = 9, but could be used more generally. 
+// We actually compute C * P1 + D * (2^33 * p_256k1 - P2) + +#define cmsub_p256k1(P0,C,P1,D,P2) \ + movq $0xfffff85e00000000, %r8 ; \ + subq P2, %r8 ; \ + movq $0xfffffffffffffffd, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq $0xffffffffffffffff, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq $0x00000001ffffffff, %r12 ; \ + sbbq 32+P2, %r12 ; \ + movq $D, %rdx ; \ + mulxq %r8, %r8, %rax ; \ + mulxq %r9, %r9, %rcx ; \ + addq %rax, %r9 ; \ + mulxq %r10, %r10, %rax ; \ + adcq %rcx, %r10 ; \ + mulxq %r11, %r11, %rcx ; \ + adcq %rax, %r11 ; \ + mulxq %r12, %r12, %rax ; \ + adcq %rcx, %r12 ; \ + movq $C, %rdx ; \ + xorq %rbx, %rbx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 16+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 24+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + mulxq 32+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +// P0 = 3 * P1 - 8 * P2 with 5-digit P1 and P2 +// We actually compute 3 * P1 + (2^33 * p_256k1 - P2) << 3 + +#define cmsub38_p256k1(P0,P1,P2) \ + movq $0xfffff85e00000000, %r8 ; \ + subq P2, %r8 ; \ + movq $0xfffffffffffffffd, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq $0xffffffffffffffff, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq $0x00000001ffffffff, %r12 ; \ + sbbq 32+P2, %r12 ; \ + shldq $3, %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + movq $3, %rdx ; \ + xorq %rbx, %rbx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 16+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 24+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + mulxq 32+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +// P0 = 4 * P1 - P2 with 5-digit P1, 4-digit P2 and result. +// This is done by direct subtraction of P2 since the method +// in bignum_cmul_p256k1 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256k1, which is the case here. 
+ +#define cmsub41_p256k1(P0,P1,P2) \ + movq 32+P1, %r12 ; \ + movq 24+P1, %r11 ; \ + shldq $2, %r11, %r12 ; \ + movq 16+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + subq P2, %r8 ; \ + sbbq 8+P2, %r9 ; \ + sbbq 16+P2, %r10 ; \ + sbbq 24+P2, %r11 ; \ + sbbq $0, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + xorq %rbx, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq $0x0, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +S2N_BN_SYMBOL(secp256k1_jdouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations + + // y_2 = y^2 + + sqr_p256k1(y_2,y_1) + + // x_2 = x^2 + + sqr_p256k1(x_2,x_1) + + // tmp = 2 * y_1 (in 4 words but not fully normalized) + + weakdouble_p256k1(tmp,y_1) + + // xy2 = x * y^2 (5-digit partially reduced) + // x_4 = x^4 (5-digit partially reduced) + + roughmul_p256k1(xy2,x_1,y_2) + roughsqr_p256k1(x_4,x_2) + + // z_3 = 2 * y_1 * z_1 + + mul_p256k1(z_3,z_1,tmp) + + // d = 12 * xy2 - 9 * x_4 + + cmsub_p256k1(d,12,xy2,9,x_4) + + // y4 = y2^2 (5-digit partially reduced) + + roughsqr_p256k1(y_4,y_2) + + // dx2 = d * x_2 (5-digit partially reduced) + + roughmul_p256k1(dx2,x_2,d) + + // x_3 = 4 * xy2 - d + + cmsub41_p256k1(x_3,xy2,d) + + // y_3 = 3 * dx2 - 8 * y_4 + + cmsub38_p256k1(y_3,dx2,y_4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble_alt.S new file mode 100644 index 00000000000..1452f4a3a93 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jdouble_alt.S @@ -0,0 +1,813 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_montjdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// It is assumed that all coordinates of the input point are fully +// reduced mod p_256k1 and that the z coordinate is not zero. 
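
A reference sketch (not part of the patch) of the doubling schedule in secp256k1_jdouble above, checking the 12/9, 4, and 3/8 constants against the affine tangent formula for a curve with a = 0. Names are illustrative and every step is fully reduced mod p, unlike the partially reduced 5-word intermediates in the assembly. Assumes Python 3.8+ for `pow(x, -1, p)`.

```python
import random

P = 2**256 - 2**32 - 977                      # p_256k1

def jdouble(p1):                              # same order as the macro calls
    x1, y1, z1 = p1
    y_2 = y1 * y1 % P
    x_2 = x1 * x1 % P
    tmp = 2 * y1 % P                          # weakdouble in the assembly
    xy2 = x1 * y_2 % P                        # x * y^2
    x_4 = x_2 * x_2 % P                       # x^4
    z_3 = z1 * tmp % P                        # 2 * y * z
    d = (12 * xy2 - 9 * x_4) % P
    y_4 = y_2 * y_2 % P                       # y^4
    dx2 = x_2 * d % P
    x_3 = (4 * xy2 - d) % P
    y_3 = (3 * dx2 - 8 * y_4) % P
    return x_3, y_3, z_3

def to_affine(p):
    x, y, z = p
    zi = pow(z, -1, P)
    return x * zi * zi % P, y * zi * zi * zi % P

def affine_double(a):                         # tangent rule, a = 0 on secp256k1
    lam = 3 * a[0] * a[0] * pow(2 * a[1], -1, P) % P
    x3 = (lam * lam - 2 * a[0]) % P
    y3 = (lam * (a[0] - x3) - a[1]) % P
    return x3, y3

# Any point on y^2 = x^3 + 7 will do; build one from a random x (P % 4 == 3).
while True:
    x = random.randrange(P)
    y = pow((x**3 + 7) % P, (P + 1) // 4, P)
    if y * y % P == (x**3 + 7) % P and y != 0:
        break
assert to_affine(jdouble((x, y, 1))) == affine_double((x, y))
print("doubling schedule matches the affine tangent formula")
```
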
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define x_2 (NUMSIZE*0)(%rsp) +#define y_2 (NUMSIZE*1)(%rsp) +#define d (NUMSIZE*2)(%rsp) +#define tmp (NUMSIZE*3)(%rsp) +#define x_4 (NUMSIZE*4)(%rsp) +#define y_4 (NUMSIZE*6)(%rsp) +#define dx2 (NUMSIZE*8)(%rsp) +#define xy2 (NUMSIZE*10)(%rsp) + +#define NSPACE (NUMSIZE*12) + +// Corresponds to bignum_mul_p256k1_alt except %rsi -> %rbx + +#define mul_p256k1(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + 
mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds to bignum_sqr_p256k1_alt except for %rsi -> %rbx + +#define sqr_p256k1(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Rough versions producing 5-word results + +#define roughmul_p256k1(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ 
+ adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +#define roughsqr_p256k1(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq 
%rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 ; \ + movq %r12, 0x20+P0 + +// Weak doubling operation, staying in 4 digits but not in general +// fully normalizing + +#define weakdouble_p256k1(P0,P1) \ + movq 24+P1, %r11 ; \ + movq 16+P1, %r10 ; \ + movq $0x1000003d1, %rax ; \ + xorq %rdx, %rdx ; \ + shldq $1, %r10, %r11 ; \ + cmovncq %rdx, %rax ; \ + movq 8+P1, %r9 ; \ + shldq $1, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $1, %r8, %r9 ; \ + shlq $1, %r8 ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 8+P0 ; \ + movq %r10, 16+P0 ; \ + movq %r11, 24+P0 + +// P0 = C * P1 - D * P2 with 5-word inputs P1 and P2 +// Only used here with C = 12, D = 9, but could be used more generally. +// We actually compute C * P1 + D * (2^33 * p_256k1 - P2) + +#define cmsub_p256k1(P0,C,P1,D,P2) \ + movq $0xfffff85e00000000, %r9 ; \ + subq P2, %r9 ; \ + movq $0xfffffffffffffffd, %r10 ; \ + sbbq 8+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 16+P2, %r11 ; \ + movq $0xffffffffffffffff, %r12 ; \ + sbbq 24+P2, %r12 ; \ + movq $0x00000001ffffffff, %r13 ; \ + sbbq 32+P2, %r13 ; \ + movq $D, %rcx ; \ + movq %r9, %rax ; \ + mulq %rcx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq %r10, %rax ; \ + xorl %r10d, %r10d ; \ + mulq %rcx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq %r11, %rax ; \ + xorl %r11d, %r11d ; \ + mulq %rcx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r12, %rax ; \ + xorl %r12d, %r12d ; \ + mulq %rcx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + imulq %r13, %rcx ; \ + addq %rcx, %r12 ; \ + movq $C, %rcx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + xorl %ebx, %ebx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +// P0 = 3 * P1 - 8 * P2 with 5-digit P1 and P2 +// We actually compute 3 * P1 + (2^33 * p_256k1 - P2) << 3 + +#define cmsub38_p256k1(P0,P1,P2) \ + movq $0xfffff85e00000000, %r8 ; \ + subq P2, %r8 ; \ + movq $0xfffffffffffffffd, %r9 ; \ + sbbq 8+P2, %r9 ; \ + movq $0xffffffffffffffff, %r10 ; \ + sbbq 16+P2, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + sbbq 24+P2, %r11 ; \ + movq $0x00000001ffffffff, %r12 ; \ + sbbq 32+P2, %r12 ; \ + shldq $3, %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + movl $3, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 
; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + movq 0x20+P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + xorl %ebx, %ebx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +// P0 = 4 * P1 - P2 with 5-digit P1, 4-digit P2 and result. +// This is done by direct subtraction of P2 since the method +// in bignum_cmul_p256k1 etc. for quotient estimation still +// works when the value to be reduced is negative, as +// long as it is > -p_256k1, which is the case here. + +#define cmsub41_p256k1(P0,P1,P2) \ + movq 32+P1, %r12 ; \ + movq 24+P1, %r11 ; \ + shldq $2, %r11, %r12 ; \ + movq 16+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + subq P2, %r8 ; \ + sbbq 8+P2, %r9 ; \ + sbbq 16+P2, %r10 ; \ + sbbq 24+P2, %r11 ; \ + sbbq $0, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + movq $0x1000003d1, %rcx ; \ + mulq %rcx; \ + xorq %rbx, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq $0x0, %r11 ; \ + cmovbq %rbx, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rbx, %r9 ; \ + movq %r9, 8+P0 ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 16+P0 ; \ + sbbq %rbx, %r11 ; \ + movq %r11, 24+P0 ; \ + +S2N_BN_SYMBOL(secp256k1_jdouble_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main sequence of operations + + // y_2 = y^2 + + sqr_p256k1(y_2,y_1) + + // x_2 = x^2 + + sqr_p256k1(x_2,x_1) + + // tmp = 2 * y_1 (in 4 words but not fully normalized) + + weakdouble_p256k1(tmp,y_1) + + // xy2 = x * y^2 (5-digit partially reduced) + // x_4 = x^4 (5-digit partially reduced) + + roughmul_p256k1(xy2,x_1,y_2) + roughsqr_p256k1(x_4,x_2) + + // z_3 = 2 * y_1 * z_1 + + mul_p256k1(z_3,z_1,tmp) + + // d = 12 * xy2 - 9 * x_4 + + cmsub_p256k1(d,12,xy2,9,x_4) + + // y4 = y2^2 (5-digit partially reduced) + + roughsqr_p256k1(y_4,y_2) + + // dx2 = d * x_2 (5-digit partially reduced) + + roughmul_p256k1(dx2,x_2,d) + + // x_3 = 4 * xy2 - d + + cmsub41_p256k1(x_3,xy2,d) + + // y_3 = 3 * dx2 - 8 * y_4 + + cmsub38_p256k1(y_3,dx2,y_4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd.S new file mode 100644 index 00000000000..561b645e4dc --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd.S @@ -0,0 +1,397 @@ +// Copyright 
Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_256k1, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jmixadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// all of which are maintained throughout the code. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds exactly to bignum_mul_p256k1 + +#define mul_p256k1(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, 
%r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movabs $0x1000003d1, %rdx ; \ + xorl %ecx, %ecx ; \ + mulxq %r12, %rax, %rbx ; \ + adcxq %rax, %r8 ; \ + adoxq %rbx, %r9 ; \ + mulxq %r13, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq %r14, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + adcxq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rbx ; \ + addq %rax, %r8 ; \ + adcq %rbx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + cmovbq %rcx, %rdx ; \ + subq %rdx, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sqr_p256k1 + +#define sqr_p256k1(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ebx, %ebx ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r11 ; \ + adoxq %rcx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rbx, %r13 ; \ + adoxq %rbx, %r14 ; \ + adcq %rbx, %r14 ; \ + xorl %ebx, %ebx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rbx, %r15 ; \ + adoxq %rbx, %r15 ; \ + movabs $0x1000003d1, %rdx ; \ + xorl %ebx, %ebx ; \ + mulxq %r12, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq %r13, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq %r14, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq %r15, %rax, %r12 ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + adcxq %rbx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulxq %rax, %rax, %rcx ; \ + addq %rax, %r8 ; \ + adcq %rcx, %r9 ; \ + adcq %rbx, %r10 ; \ + adcq %rbx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rdx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rbx, %r9 ; \ + sbbq %rbx, %r10 ; \ + sbbq %rbx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + xorl %eax, %eax ; \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + sbbq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + movabs $0x1000003d1, %rcx ; \ + cmovae %rax, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rax, %r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq %rax, %r11 ; \ + movq %r11, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; \ + movq PEQ, %rax ; \ + cmovzq %rax, 
r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(secp256k1_jmixadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it stays + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations + + sqr_p256k1(zp2,z_1) + + mul_p256k1(y2a,z_1,y_2) + mul_p256k1(x2a,zp2,x_2) + mul_p256k1(y2a,zp2,y2a) + + sub_p256k1(xd,x2a,x_1) + + sub_p256k1(yd,y2a,y_1) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x_1) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(resz,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y_1) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movl $0, %eax + cmovzq %rax, %r9 + cmovzq %rax, %r10 + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd_alt.S new file mode 100644 index 00000000000..8e91773c638 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/secp256k1/secp256k1_jmixadd_alt.S @@ -0,0 +1,478 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on SECG curve secp256k1 in Jacobian coordinates +// +// extern void secp256k1_jmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples. +// A Jacobian triple (x,y,z) represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. It is assumed that +// all the coordinates of the input points p1 and p2 are fully reduced +// mod p_256k1, that the z coordinate of p1 is nonzero and that neither +// p1 =~= p2 or p1 =~= -p2, where "=~=" means "represents the same affine +// point as". 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(secp256k1_jmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(secp256k1_jmixadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// all of which are maintained throughout the code. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_mul_p256k1_alt except %rsi -> %rbx + +#define mul_p256k1(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + 
subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds to bignum_sqr_p256k1_alt except for %rsi -> %rbx + +#define sqr_p256k1(P0,P1) \ + movq P1, %rax ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r11 ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq $0x0, %r11 ; \ + xorq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r12 ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r13 ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r14 ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P1; \ + addq %rax, %rax ; \ + adcq %rdx, %rdx ; \ + adcq $0x0, %r15 ; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq $0x1000003d1, %rbx ; \ + movq %r12, %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rcx, %rcx ; \ + movq %r13, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rcx, %rcx ; \ + movq %r14, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq %r15, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + xorq %rcx, %rcx ; \ + addq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + adcq %rcx, %r12 ; \ + leaq 0x1(%r12), %rax ; \ + mulq %rbx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + adcq %rcx, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rax, %rax ; \ + notq %rax; \ + andq %rbx, %rax ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rcx, %r11 ; \ + movq %r8, P0 ; \ + movq %r9, 0x8+P0 ; \ + movq %r10, 0x10+P0 ; \ + movq %r11, 0x18+P0 + +// Corresponds exactly to bignum_sub_p256k1 + +#define sub_p256k1(P0,P1,P2) \ + xorl %eax, %eax ; \ + movq P1, %r8 ; \ + subq P2, %r8 ; \ + movq 0x8+P1, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + movq 0x10+P1, %r10 ; \ + sbbq 0x10+P2, %r10 ; \ + movq 0x18+P1, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + movabs $0x1000003d1, %rcx ; \ + cmovae %rax, %rcx ; \ + subq %rcx, %r8 ; \ + movq %r8, P0 ; \ + sbbq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + sbbq %rax, 
%r10 ; \ + movq %r10, 0x10+P0 ; \ + sbbq %rax, %r11 ; \ + movq %r11, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; \ + movq PEQ, %rax ; \ + cmovzq %rax, r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(secp256k1_jmixadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it stays + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations + + sqr_p256k1(zp2,z_1) + + mul_p256k1(y2a,z_1,y_2) + mul_p256k1(x2a,zp2,x_2) + mul_p256k1(y2a,zp2,y2a) + + sub_p256k1(xd,x2a,x_1) + + sub_p256k1(yd,y2a,y_1) + + sqr_p256k1(zz,xd) + sqr_p256k1(ww,yd) + + mul_p256k1(zzx1,zz,x_1) + mul_p256k1(zzx2,zz,x2a) + + sub_p256k1(resx,ww,zzx1) + sub_p256k1(t1,zzx2,zzx1) + + mul_p256k1(resz,xd,z_1) + + sub_p256k1(resx,resx,zzx2) + + sub_p256k1(t2,zzx1,resx) + + mul_p256k1(t1,t1,y_1) + mul_p256k1(t2,yd,t2) + + sub_p256k1(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with an extra z = 1 +// coordinate, hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movl $0, %eax + cmovzq %rax, %r9 + cmovzq %rax, %r10 + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_add_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_add_sm2.S new file mode 100644 index 00000000000..3edff95c3e1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_add_sm2.S @@ -0,0 +1,100 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Add modulo p_sm2, z := (x + y) mod p_sm2, assuming x and y reduced +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_add_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_add_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_add_sm2) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %rdx +#define c %r11 + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_add_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and add the two inputs as 2^256 * c + [d3;d2;d1;d0] = x + y + + xorq c, c + movq (x), d0 + addq (y), d0 + movq 8(x), d1 + adcq 8(y), d1 + movq 16(x), d2 + adcq 16(y), d2 + movq 24(x), d3 + adcq 24(y), d3 + adcq c, c + +// Now subtract 2^256 * c + [d3;d3;d1;d1] = x + y - p_sm2 +// The constants n1 and n3 in [n3; 0; n1; -1] = p_sm2 are saved for later + + subq $-1, d0 + movq $0xffffffff00000000, n1 + sbbq n1, d1 + sbbq $-1, d2 + movq $0xfffffffeffffffff, n3 + sbbq n3, d3 + +// Since by hypothesis x < p_sm2 we know x + y - p_sm2 < 2^256, so the top +// carry c actually gives us a bitmask for x + y - p_sm2 < 0, which we +// now use to make a masked p_sm2' = [n3; 0; n1; c] + + sbbq $0, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq c, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2.S new file mode 100644 index 00000000000..e4c2caf3869 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2.S @@ -0,0 +1,133 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
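
A minimal Python model (not part of the patch) of the bignum_add_sm2 flow just above: add, tentatively subtract p_sm2, and add p_sm2 back only when that subtraction borrowed. The branch here stands in for the masked constant [n3; 0; n1; c] that the assembly builds from the borrow.

```python
import random

P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1
N1, N3 = 0xffffffff00000000, 0xfffffffeffffffff
# The words of p_sm2 really are [n3; -1; n1; -1], as the comments state.
assert P_SM2 == (N3 << 192) | (0xffffffffffffffff << 128) | (N1 << 64) | 0xffffffffffffffff

def add_sm2(x, y):
    t = x + y - P_SM2                     # 2^256*c + [d3;d2;d1;d0] minus p_sm2
    return t + P_SM2 if t < 0 else t      # corrective (masked) addition

for _ in range(1000):
    x, y = random.randrange(P_SM2), random.randrange(P_SM2)
    assert add_sm2(x, y) == (x + y) % P_SM2
print("add_sm2 model matches (x + y) mod p_sm2")
```
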
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_sm2 +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_sm2) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply +#define x %rcx +// Likewise this is thrown away after initial multiply +#define m %rdx + +#define a %rax +#define c %rcx + +#define d0 %rsi +#define d1 %r8 +#define d2 %r9 +#define d3 %r10 +#define h %r11 + +// Multiplier again for second stage +#define q %rdx +#define qshort %edx + +S2N_BN_SYMBOL(bignum_cmul_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want multiplier in %rdx) + + movq %rdx, x + movq %rsi, m + +// Multiply, accumulating the result as ca = 2^256 * h + [d3;d2;d1;d0] + + mulxq (x), d0, d1 + mulxq 8(x), a, d2 + addq a, d1 + mulxq 16(x), a, d3 + adcq a, d2 + mulxq 24(x), a, h + adcq a, d3 + adcq $0, h + +// Quotient approximation is (h * (1 + 2^32 + 2^64) + d3 + 2^64) >> 64. +// Note that by hypothesis our product is <= (2^64 - 1) * (p_sm2 - 1), +// so there is no need to max this out to avoid wrapping, unlike in the +// more general case of bignum_mod_sm2. + + movq d3, a + movl $1, qshort + addq h, a + adcq h, q + + shrq $32, a + addq h, a + + shrq $32, a + addq a, q + +// Now compute the initial pre-reduced [h;d3;d2;d1;d0] = ca - p_sm2 * q +// = ca - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q + + movq q, a + movq q, c + shlq $32, a + shrq $32, c + + addq a, d3 + adcq c, h + + subq q, a + sbbq $0, c + + subq q, h + + addq q, d0 + adcq a, d1 + adcq c, d2 + adcq $0, d3 + adcq $0, h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_sm2 + + movq $0xffffffff00000000, a + andq h, a + movq $0xfffffffeffffffff, c + andq h, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq h, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2_alt.S new file mode 100644 index 00000000000..770d83e9ed5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_cmul_sm2_alt.S @@ -0,0 +1,150 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
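
A Python sketch (not part of the patch) of the reduction in bignum_cmul_sm2 just above. Per the comments in the code, the quotient approximation q = (h*(1 + 2^32 + 2^64) + d3 + 2^64) >> 64 is at most one too large and never too small for products c*x with x already reduced, so a single masked addition of p_sm2 finishes the job; the function name and the widened arithmetic are illustrative only.

```python
import random

P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1

def cmul_sm2(c, x):
    ca = c * x
    h, l = ca >> 256, ca & (2**256 - 1)
    d3 = l >> 192                                     # top word of the low part
    q = (h * (1 + 2**32 + 2**64) + d3 + 2**64) >> 64  # quotient approximation
    r = ca - q * P_SM2
    assert -P_SM2 <= r < P_SM2                        # one correction suffices
    return r + P_SM2 if r < 0 else r                  # masked add of p_sm2

for _ in range(1000):
    c, x = random.randrange(2**64), random.randrange(P_SM2)
    assert cmul_sm2(c, x) == c * x % P_SM2
print("cmul_sm2 quotient approximation reduces correctly")
```
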
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Multiply by a single word modulo p_sm2, z := (c * x) mod p_sm2, assuming +// x reduced +// Inputs c, x[4]; output z[4] +// +// extern void bignum_cmul_sm2_alt +// (uint64_t z[static 4], uint64_t c, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = c, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = c, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_cmul_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_cmul_sm2_alt) + .text + +#define z %rdi + +// Temporarily moved here for initial multiply then thrown away + +#define x %rcx +#define m %rsi + +// Other variables + +#define d %rdx +#define a %rax +#define c %rcx + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 +#define h %rsi + +#define hshort %esi + +// Multiplier again for second stage +#define q %rdx +#define qshort %edx + +S2N_BN_SYMBOL(bignum_cmul_sm2_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Shuffle inputs (since we want %rdx for the high parts of products) + + movq %rdx, x + +// Multiply, accumulating the result as ca = 2^256 * h + [d3;d2;d1;d0] + + movq (x), a + mulq m + movq a, d0 + movq d, d1 + + movq 8(x), a + mulq m + xorq d2, d2 + addq a, d1 + adcq d, d2 + + movq 16(x), a + mulq m + xorq d3, d3 + addq a, d2 + adcq d, d3 + + movq 24(x), a + mulq m + xorl hshort, hshort + addq a, d3 + adcq d, h + +// Quotient approximation is (h * (1 + 2^32 + 2^64) + d3 + 2^64) >> 64. +// Note that by hypothesis our product is <= (2^64 - 1) * (p_sm2 - 1), +// so there is no need to max this out to avoid wrapping, unlike in the +// more general case of bignum_mod_sm2. + + movq d3, a + movl $1, qshort + addq h, a + adcq h, q + + shrq $32, a + addq h, a + shrq $32, a + addq a, q + +// Now compute the initial pre-reduced [h;d3;d2;d1;d0] = ca - p_sm2 * q +// = ca - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q + + movq q, a + movq q, c + shlq $32, a + shrq $32, c + + addq a, d3 + adcq c, h + + subq q, a + sbbq $0, c + + subq q, h + + addq q, d0 + adcq a, d1 + adcq c, d2 + adcq $0, d3 + adcq $0, h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_sm2 + + movq $0xffffffff00000000, a + andq h, a + movq $0xfffffffeffffffff, c + andq h, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq h, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_deamont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_deamont_sm2.S new file mode 100644 index 00000000000..fa0bc34eed5 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_deamont_sm2.S @@ -0,0 +1,119 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from almost-Montgomery form, z := (x / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_deamont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Convert a 4-digit bignum x out of its (optionally almost) Montgomery form, +// "almost" meaning any 4-digit input will work, with no range restriction. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_deamont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_deamont_sm2) + .text + +#define z %rdi +#define x %rsi + +#define c %rcx +#define n1 %rax +#define n3 %rdx + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rsi +// as temporaries. +// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rsi ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rsi, d0 + +S2N_BN_SYMBOL(bignum_deamont_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Systematically scroll left doing 1-step reductions. This process +// keeps things inside 4 digits (i.e. < 2^256) at each stage, since +// we have w * p_sm2 + x <= (2^64 - 1) * p_sm2 + (2 EXP 256 - 1) +// <= (2^64 - 1) * (2^256 - 1) + (2 EXP 256 - 1) <= 2^64 * (2^256 - 1) + + montreds(%r11,%r10,%r9,%r8) + + montreds(%r8,%r11,%r10,%r9) + + montreds(%r9,%r8,%r11,%r10) + + montreds(%r10,%r9,%r8,%r11) + +// Let [%r11;%r10;%r9;%r8] := [%r11;%r10;%r9;%r8] - p_sm2, saving constants +// n1 and n3 in [n3; -1; n1; -1] = p_sm2 for later use. + + subq $-1, %r8 + movq $0xffffffff00000000, n1 + sbbq n1, %r9 + sbbq $-1, %r10 + movq $0xfffffffeffffffff, n3 + sbbq n3, %r11 + +// Capture the carry to determine whether to add back p_sm2, and use +// it to create a masked p_sm2' = [n3; c; n1; c] + + sbbq c, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, %r8 + movq %r8, (z) + adcq n1, %r9 + movq %r9, 8(z) + adcq c, %r10 + movq %r10, 16(z) + adcq n3, %r11 + movq %r11, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_demont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_demont_sm2.S new file mode 100644 index 00000000000..360a4e50811 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_demont_sm2.S @@ -0,0 +1,93 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
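The montreds macro used by these Montgomery conversion routines amounts to one word-sized reduction step. A rough Python model (illustrative only, not part of the imported source; it relies on p_sm2 being congruent to -1 mod 2^64, so the Montgomery multiplier is just the low word):

P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1

def montreds_model(d):
    # One step: add d0 * p_sm2 to clear the low word, then drop that word.
    # (d + d0 * P_SM2) is an exact multiple of 2^64 since P_SM2 == -1 (mod 2^64).
    d0 = d & (2**64 - 1)
    return (d + d0 * P_SM2) >> 64

Four such steps divide by 2^256 overall, which is the x / 2^256 (mod p_sm2) conversion these routines compute.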
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert from Montgomery form z := (x / 2^256) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_demont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// This assumes the input is < p_sm2 for correctness. If this is not the case, +// use the variant "bignum_deamont_sm2" instead. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_demont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_demont_sm2) + .text + +#define z %rdi +#define x %rsi + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rsi +// as temporaries. +// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rsi ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rsi, d0 + +S2N_BN_SYMBOL(bignum_demont_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Set up an initial 4-word window [%r11,%r10,%r9,%r8] = x + + movq (x), %r8 + movq 8(x), %r9 + movq 16(x), %r10 + movq 24(x), %r11 + +// Systematically scroll left doing 1-step reductions. This process +// keeps things reduced < p_sm2 at each stage, since we have +// w * p_sm2 + x <= (2^64 - 1) * p_sm2 + (p_sm2 - 1) < 2^64 * p_sm2 + + montreds(%r11,%r10,%r9,%r8) + + montreds(%r8,%r11,%r10,%r9) + + montreds(%r9,%r8,%r11,%r10) + + montreds(%r10,%r9,%r8,%r11) + +// Write back result + + movq %r8, (z) + movq %r9, 8(z) + movq %r10, 16(z) + movq %r11, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_double_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_double_sm2.S new file mode 100644 index 00000000000..857a0675308 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_double_sm2.S @@ -0,0 +1,97 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Double modulo p_sm2, z := (2 * x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_double_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_double_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_double_sm2) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 +#define c %rax + +#define n1short %r10d + +S2N_BN_SYMBOL(bignum_double_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input and double it so that 2^256 * c + [d3;d2;d1;d0] = 2 * x +// Could also consider using shld to decouple carries + + xorq c, c + movq (x), d0 + addq d0, d0 + movq 8(x), d1 + adcq d1, d1 + movq 16(x), d2 + adcq d2, d2 + movq 24(x), d3 + adcq d3, d3 + adcq c, c + +// Now subtract 2^256 * c + [d3;d3;d1;d1] = 2 * x - p_sm2 +// The constants n1 and n3 in [n3; -1; n1; -1] = p_sm2 are saved for later + + subq $-1, d0 + movq $0xffffffff00000000, n1 + sbbq n1, d1 + sbbq $-1, d2 + movq $0xfffffffeffffffff, n3 + sbbq n3, d3 + +// Since by hypothesis x < p_sm2 we know 2 * x - p_sm2 < 2^256, so the top +// carry c actually gives us a bitmask for 2 * x - p_sm2 < 0, which we +// now use to make a masked p_sm2' = [n3; c; n1; c] + + sbbq $0, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq c, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_half_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_half_sm2.S new file mode 100644 index 00000000000..b2502942b69 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_half_sm2.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Halve modulo p_sm2, z := (x / 2) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_half_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_half_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_half_sm2) + .text + +#define z %rdi +#define x %rsi + +#define a %rax +#define d0 %rcx +#define d1 %rdx +#define d2 %r8 +#define d3 %r9 + +#define d0short %ecx +#define d1short %edx + +S2N_BN_SYMBOL(bignum_half_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load lowest digit and get a mask for its lowest bit in d0 + + movq (x), a + movl $1, d0short + andq a, d0 + negq d0 + +// Create a masked version of p_sm2 + + movq $0xffffffff00000000, d1 + andq d0, d1 + movq d0, d2 + movq $0xfffffffeffffffff, d3 + andq d0, d3 + +// Perform addition with masked p_sm2. Catch the carry in a, as a bitmask +// for convenience though we only use its LSB below with SHRD + + addq a, d0 + adcq 8(x), d1 + adcq 16(x), d2 + adcq 24(x), d3 + sbbq a, a + +// Shift right, pushing the carry back down, and store back + + shrdq $1, d1, d0 + movq d0, (z) + shrdq $1, d2, d1 + movq d1, 8(z) + shrdq $1, d3, d2 + movq d2, 16(z) + shrdq $1, a, d3 + movq d3, 24(z) + +// Return + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_inv_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_inv_sm2.S new file mode 100644 index 00000000000..dffa018b221 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_inv_sm2.S @@ -0,0 +1,1629 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_inv_sm2(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_sm2, i.e. is not divisible +// by it, returns z < p_sm2 such that x * z == 1 (mod p_sm2). Note that +// x does not need to be reduced modulo p_sm2, but the output always is. +// If the input is divisible (i.e. is 0 or p_sm2), then there can be no +// modular inverse and z = 0 is returned. 
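The contract spelled out above can be checked against a short Python reference model (illustrative only, not part of the imported assembly):

P_SM2 = 2**256 - 2**224 - 2**96 + 2**64 - 1  # the SM2 field characteristic

def ref_inv_sm2(x):
    # Reduce first; return 0 when no inverse exists (x divisible by p_sm2),
    # otherwise the unique z < p_sm2 with (x * z) % p_sm2 == 1.
    x %= P_SM2
    return 0 if x == 0 else pow(x, -1, P_SM2)  # pow(_, -1, m) needs Python >= 3.8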
+// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_sm2) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define f 0(%rsp) +#define g (5*N)(%rsp) +#define u (10*N)(%rsp) +#define v (15*N)(%rsp) +#define tmp (20*N)(%rsp) +#define tmp2 (21*N)(%rsp) +#define i (22*N)(%rsp) +#define d (23*N)(%rsp) + +#define mat (24*N)(%rsp) + +// Backup for the input pointer + +#define res (28*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (30*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (5*N) +#define U (10*N) +#define V (15*N) +#define MAT (24*N) + +#define ff (%rsp) +#define gg (5*N)(%rsp) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro from u[4..0] to u[3..0]. +// --------------------------------------------------------------------------- + +#define amontred(P) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_sm2. */ \ + movq $0xe000000000000000, %r8 ; \ + addq P, %r8 ; \ + movq $0x1fffffffffffffff, %r9 ; \ + adcq 8+P, %r9 ; \ + movq $0xffffffffe0000000, %r10 ; \ + adcq 16+P, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + adcq 24+P, %r11 ; \ + movq $0x1fffffffdfffffff, %r12 ; \ + adcq 32+P, %r12 ; \ +/* Let [%rcx;%rbx] = 2^32 * d0 and [%rdx;%rax] = (2^32-1) * d0 */ \ + movq %r8, %rbx ; \ + movq %r8, %rcx ; \ + shrq $32, %rcx ; \ + shlq $32, %rbx ; \ + movl $0xffffffff, %eax ; \ + mulq %r8; \ +/* Now [%r12;%r11;%r10;%r9] := [%r8;%r11;%r10;%r9] - [%rcx;%rbx;%rdx;%rax] */ \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rcx, %r8 ; \ + addq %r8, %r12 ; \ +/* Now capture carry and subtract p_sm2 if set (almost-Montgomery) */ \ + sbbq %rax, %rax ; \ + movl $0xffffffff, %ebx ; \ + notq %rbx; \ + andq %rax, %rbx ; \ + movq %rax, %rdx ; \ + btr $32, %rdx ; \ + subq %rax, %r9 ; \ + movq %r9, P ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 8+P ; \ + sbbq %rax, %r11 ; \ + movq %r11, 16+P ; \ + sbbq %rdx, %r12 ; \ + movq %r12, 24+P + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; 
\ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq 
$0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, 
%r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq 
MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + 
subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_inv_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Create constant [%rdx;%rcx;%rbx;%rax] = p_sm2 and copy it into the variable 
f +// including the 5th zero digit + + xorl %ebp, %ebp + leaq -1(%rbp), %rax + movl $0x00000000ffffffff, %ebx + notq %rbx + movq %rax, %rcx + movq %rax, %rdx + btr $32, %rdx + + movq %rax, F(%rsp) + movq %rbx, F+8(%rsp) + movq %rcx, F+16(%rsp) + movq %rdx, F+24(%rsp) + movq %rbp, F+32(%rsp) + +// Now reduce the input modulo p_sm2, first negating the constant to get +// [%rdx;%rcx;%rbx;%rax] = 2^256 - p_sm2, adding it to x and hence getting +// the comparison x < p_sm2 <=> (2^256 - p_sm2) + x < 2^256 and choosing +// g accordingly. + + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + + movl $1, %eax + notq %rbx + xorl %ecx, %ecx + notq %rdx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq %rax, G(%rsp) + movq %rbx, G+8(%rsp) + movq %rcx, G+16(%rsp) + movq %rdx, G+24(%rsp) + xorl %eax, %eax + movq %rax, G+32(%rsp) + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-50} * [u,v] (mod p_sm2) +// starting with [p_sm2,x] == x * 2^{5*0-50} * [0,2^50] (mod p_sm2) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+8(%rsp) + movq %rax, U+16(%rsp) + movq %rax, U+24(%rsp) + + movq $0x0004000000000000, %rcx + movq %rcx, V(%rsp) + movq %rax, V+8(%rsp) + movq %rax, V+16(%rsp) + movq %rax, V+24(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + movq $10, i + movq $1, d + jmp bignum_inv_sm2_midloop + +bignum_inv_sm2_loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
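As a whole-number model of the digit-by-digit update that follows (a sketch only: m00..m11 stand for the signed, possibly negated, matrix entries produced by divstep59, and Python's >> on negative integers is an arithmetic shift, matching the shrdq-by-59 chain below):

def update_fg(f, g, m00, m01, m10, m11):
    # [f'; g'] = (M * [f; g]) scaled down by 2^59
    return (m00 * f + m01 * g) >> 59, (m10 * f + m11 * g) >> 59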
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digits 3 and 4 of [f,g] + + movq F+3*N(%rsp), %rax + xorq %r9, %rax + movq F+4*N(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + movq G+4*N(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + shrdq $59, %rbp, %rsi + sarq $59, %rbp + + movq F+3*N(%rsp), %rax + movq %rsi, F+3*N(%rsp) + + movq F+4*N(%rsp), %rsi + movq %rbp, F+4*N(%rsp) + + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq G+3*N(%rsp), %rax + xorq %r15, %rax + movq G+4*N(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + shrdq $59, %rsi, %rbx + movq %rbx, G+3*N(%rsp) + sarq $59, %rsi + movq %rsi, G+4*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq 
%rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Preload for last use of old u digit 3 + + movq U+3*N(%rsp), %rax + movq %rcx, U+3*N(%rsp) + movq %rdx, U+4*N(%rsp) + +// Digits 3 and 4 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+3*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, V+3*N(%rsp) + movq %rdx, V+4*N(%rsp) + +// Montgomery reduction of u + + amontred(u) + +// Montgomery reduction of v + + amontred(v) + +bignum_inv_sm2_midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz bignum_inv_sm2_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_sm2) +// we want to flip the sign of u according to that of f. 
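A minimal sketch of that sign fix-up (assuming, per the comment above, that |f| = 1 and f == x * u modulo p_sm2, up to the Montgomery scaling removed by the final amontred):

def pick_inverse(u, f_sign, p_sm2):
    # f == +1 gives x*u == 1, so z = u; f == -1 gives z = -u mod p_sm2.
    u %= p_sm2
    return u if f_sign > 0 else (p_sm2 - u) % p_sm2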
+ + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + +// Store back and Montgomery reduce u + + movq %r12, U(%rsp) + movq %r13, U+N(%rsp) + movq %r14, U+2*N(%rsp) + movq %r15, U+3*N(%rsp) + movq %r9, U+4*N(%rsp) + + amontred(u) + +// Perform final strict reduction mod p_sm2 and copy to output + + movq U(%rsp), %r8 + movq U+N(%rsp), %r9 + movq U+2*N(%rsp), %r10 + movq U+3*N(%rsp), %r11 + + movl $1, %eax + movl $0x00000000ffffffff, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + bts $32, %rdx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq res, %rdi + movq %rax, (%rdi) + movq %rbx, N(%rdi) + movq %rcx, 2*N(%rdi) + movq %rdx, 3*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2.S new file mode 100644 index 00000000000..52ed311d316 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2.S @@ -0,0 +1,203 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_sm2 +// Input x[k]; output z[4] +// +// extern void bignum_mod_nsm2 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the GM/T 0003-2012 curve SM2. 
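The three 64-bit constants that appear throughout this routine are the nonzero little-endian words of 2^256 - n_sm2, where n_sm2 is the published SM2 group order; a quick Python cross-check:

N_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123
C = 2**256 - N_SM2  # added or subtracted below in place of n_sm2 itself
assert [(C >> (64 * i)) & (2**64 - 1) for i in range(4)] == \
    [0xAC440BF6C62ABEDD, 0x8DFC2094DE39FAD4, 0x0000000000000000, 0x0000000100000000]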
+// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rdx +#define q %rdx + +#define qshort %edx + +S2N_BN_SYMBOL(bignum_mod_nsm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_nsm2_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Reduce the top 4 digits mod n_sm2 (a conditional subtraction of n_sm2) + + movq $0xac440bf6c62abedd, n0 + movq $0x8dfc2094de39fad4, n1 + movq $0x0000000100000000, n3 + + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + sbbq d, d + notq d + andq d, n0 + andq d, n1 + andq d, n3 + subq n0, m0 + sbbq n1, m1 + sbbq $0, m2 + sbbq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_nsm2_writeback + +bignum_mod_nsm2_loop: + +// Writing the input, with the new zeroth digit implicitly appended, as +// z = 2^256 * m3 + 2^192 * m2 + t, our intended quotient approximation is +// MIN ((m3 * (1 + 2^32 + 2^64) + m2 + 2^64) >> 64) (2^64 - 1) + + movq m2, d + movl $1, qshort + addq m3, d + adcq m3, q + + shrq $32, d + addq m3, d + + shrq $32, d + addq d, q + sbbq $0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now form [m3;m2;m1;m0;d] = m - q * n_sm2 + + subq q, m3 + movq $0xac440bf6c62abedd, n0 + mulxq n0, n0, n1 + addq n0, d + adcq n1, m0 + movq $0x8dfc2094de39fad4, n0 + mulxq n0, n0, n1 + adcq $0, n1 + addq n0, m0 + adcq n1, m1 + movq $0x0000000100000000, n0 + mulxq n0, n0, n1 + adcq n0, m2 + adcq n1, m3 + +// Now our top word m3 is either zero or all 1s. 
Use it for a masked +// addition of n_sm2, which we can do by a *subtraction* of +// 2^256 - n_sm2 from our portion + + movq $0xac440bf6c62abedd, n0 + andq m3, n0 + movq $0x8dfc2094de39fad4, n1 + andq m3, n1 + movq $0x0000000100000000, n3 + andq m3, n3 + + subq n0, d + sbbq n1, m0 + sbbq $0, m1 + sbbq n3, m2 + +// Now shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_nsm2_loop + +// Write back + +bignum_mod_nsm2_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_nsm2_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_nsm2_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_nsm2_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_nsm2_writeback + movq 16(%rdx), m2 + jmp bignum_mod_nsm2_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_4.S new file mode 100644 index 00000000000..e749f1298d7 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_4.S @@ -0,0 +1,96 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_mod_nsm2_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Reduction is modulo the group order of the GM/T 0003-2012 curve SM2. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n0 %rax +#define n1 %r10 +#define n3 %r11 + +// Can re-use this as a temporary once we've loaded the input + +#define c %rsi + +S2N_BN_SYMBOL(bignum_mod_nsm2_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load a set of registers [n3; 0; n1; n0] = 2^256 - n_sm2 + + movq $0xac440bf6c62abedd, n0 + movq $0x8dfc2094de39fad4, n1 + movq $0x0000000100000000, n3 + +// Load the input and compute x + (2^256 - n_sm2) + + movq (x), d0 + addq n0, d0 + movq 8(x), d1 + adcq n1, d1 + movq 16(x), d2 + adcq $0, d2 + movq 24(x), d3 + adcq n3, d3 + +// Now CF is set iff 2^256 <= x + (2^256 - n_sm2), i.e. iff n_sm2 <= x. 
+// Create a mask for the condition x < n, and mask the three nontrivial digits +// ready to undo the previous addition with a compensating subtraction + + sbbq c, c + notq c + andq c, n0 + andq c, n1 + andq c, n3 + +// Now subtract mask * (2^256 - n_sm2) again and store + + subq n0, d0 + movq d0, (z) + sbbq n1, d1 + movq d1, 8(z) + sbbq $0, d2 + movq d2, 16(z) + sbbq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_alt.S new file mode 100644 index 00000000000..662f76459a1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_nsm2_alt.S @@ -0,0 +1,211 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo group order, z := x mod n_sm2 +// Input x[k]; output z[4] +// +// extern void bignum_mod_nsm2_alt +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Reduction is modulo the group order of the GM/T 0003-2012 curve SM2. +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_nsm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_nsm2_alt) + .text + +#define z %rdi +#define k %rsi +#define x %rcx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rdx + +#define q %rbx + +#define qshort %ebx + +S2N_BN_SYMBOL(bignum_mod_nsm2_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_nsm2_alt_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(%rdx,k,8), m3 + movq 16(%rdx,k,8), m2 + movq 8(%rdx,k,8), m1 + movq (%rdx,k,8), m0 + +// Move x into another register to leave %rdx free for multiplies and use of n3 + + movq %rdx, x + +// Reduce the top 4 digits mod n_sm2 (a conditional subtraction of n_sm2) + + movq $0xac440bf6c62abedd, n0 + movq $0x8dfc2094de39fad4, n1 + movq $0x0000000100000000, n3 + + addq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq n3, m3 + sbbq d, d + notq d + andq d, n0 + andq d, n1 + andq d, n3 + subq n0, m0 + sbbq n1, m1 + sbbq $0, m2 + sbbq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_nsm2_alt_writeback + +bignum_mod_nsm2_alt_loop: + +// Writing the input, with the new zeroth digit implicitly appended, as +// z = 2^256 * m3 + 2^192 * m2 + t, our intended quotient approximation is +// MIN ((m3 * (1 + 2^32 + 2^64) + m2 + 2^64) >> 64) (2^64 - 1) + + movq m2, d + movl $1, qshort + addq m3, d + adcq m3, q + + shrq $32, d + addq m3, d + + shrq $32, d + addq d, q + sbbq $0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now form [m3;m2;m1;m0;d] = m - q * n_sm2 + + subq q, m3 + + movq $0xac440bf6c62abedd, %rax + mulq q + addq 
%rax, d + adcq %rdx, m0 + adcq $0, m1 + adcq $0, m2 + adcq $0, m3 + + movq $0x8dfc2094de39fad4, %rax + mulq q + addq %rax, m0 + adcq %rdx, m1 + adcq $0, m2 + adcq $0, m3 + + movq $0x0000000100000000, %rax + mulq q + addq %rax, m2 + adcq %rdx, m3 + +// Now our top word m3 is either zero or all 1s. Use it for a masked +// addition of n_sm2, which we can do by a *subtraction* of +// 2^256 - n_sm2 from our portion + + movq $0xac440bf6c62abedd, n0 + andq m3, n0 + movq $0x8dfc2094de39fad4, n1 + andq m3, n1 + movq $0x0000000100000000, n3 + andq m3, n3 + + subq n0, d + sbbq n1, m0 + sbbq $0, m1 + sbbq n3, m2 + +// Now shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_nsm2_alt_loop + +// Write back + +bignum_mod_nsm2_alt_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_nsm2_alt_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_nsm2_alt_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_nsm2_alt_writeback + movq 8(%rdx), m1 + decq k + jz bignum_mod_nsm2_alt_writeback + movq 16(%rdx), m2 + jmp bignum_mod_nsm2_alt_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2.S new file mode 100644 index 00000000000..27324a2a7de --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2.S @@ -0,0 +1,198 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[k]; output z[4] +// +// extern void bignum_mod_sm2 +// (uint64_t z[static 4], uint64_t k, uint64_t *x); +// +// Standard x86-64 ABI: RDI = z, RSI = k, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = k, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_sm2) + .text + +#define z %rdi +#define k %rsi +#define x %rdx + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 +#define d %r12 + +#define n0 %rax +#define n1 %rbx +#define n3 %rcx +#define q %rcx + +#define qshort %ecx + +S2N_BN_SYMBOL(bignum_mod_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save extra registers + + pushq %rbx + pushq %r12 + +// If the input is already <= 3 words long, go to a trivial "copy" path + + cmpq $4, k + jc bignum_mod_sm2_shortinput + +// Otherwise load the top 4 digits (top-down) and reduce k by 4 + + subq $4, k + movq 24(x,k,8), m3 + movq 16(x,k,8), m2 + movq 8(x,k,8), m1 + movq (x,k,8), m0 + +// Load non-trivial digits [n3; -1; n1; -1] = p_sm2 and do a conditional +// subtraction to reduce the four starting digits [m3;m2;m1;m0] modulo p_sm2 + + subq $-1, m0 + movq $0xffffffff00000000, n1 + sbbq n1, m1 + movq $0xfffffffeffffffff, n3 + sbbq $-1, m2 + sbbq n3, m3 + + sbbq n0, n0 + + andq n0, n1 + andq n0, n3 + addq n0, m0 + adcq n1, m1 + adcq n0, m2 + adcq n3, m3 + +// Now do (k-4) iterations of 5->4 word modular reduction + + testq k, k + jz bignum_mod_sm2_writeback + +bignum_mod_sm2_loop: + +// Writing the input, with the new zeroth digit implicitly appended, as +// z = 2^256 * m3 + 2^192 * m2 + t, our intended quotient approximation is +// MIN ((m3 * (1 + 2^32 + 2^64) + m2 + 2^64) >> 64) (2^64 - 1) + + movq m2, d + movl $1, qshort + addq m3, d + adcq m3, q + + shrq $32, d + addq m3, d + + shrq $32, d + addq d, q + sbbq $0, q + +// Load the next digit so current m to reduce = [m3;m2;m1;m0;d] + + movq -8(x,k,8), d + +// Now compute the initial pre-reduced [m3;m2;m1;m0;d] = m - p_sm2 * q +// = z - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q + + movq q, n0 + movq q, n1 + shlq $32, n0 + shrq $32, n1 + + addq n0, m2 + adcq n1, m3 + + subq q, n0 + sbbq $0, n1 + + subq q, m3 + + addq q, d + adcq n0, m0 + adcq n1, m1 + adcq $0, m2 + adcq $0, m3 + +// Now our top word m3 is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_sm2 + + movq $0xffffffff00000000, n1 + andq m3, n1 + movq $0xfffffffeffffffff, n3 + andq m3, n3 + addq m3, d + adcq n1, m0 + adcq m3, m1 + adcq n3, m2 + +// Shuffle registers up and loop + + movq m2, m3 + movq m1, m2 + movq m0, m1 + movq d, m0 + + decq k + jnz bignum_mod_sm2_loop + +// Write back + +bignum_mod_sm2_writeback: + + movq m0, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +// Restore registers and return + + popq %r12 + popq %rbx +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +bignum_mod_sm2_shortinput: + + xorq m0, m0 + xorq m1, m1 + xorq m2, m2 + xorq m3, m3 + + testq k, k + jz bignum_mod_sm2_writeback + movq (%rdx), m0 + decq k + jz bignum_mod_sm2_writeback + movq 8(%rdx), m1 + decq k + jz 
bignum_mod_sm2_writeback + movq 16(%rdx), m2 + jmp bignum_mod_sm2_writeback + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2_4.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2_4.S new file mode 100644 index 00000000000..314a230c587 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_mod_sm2_4.S @@ -0,0 +1,84 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Reduce modulo field characteristic, z := x mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_mod_sm2_4 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mod_sm2_4) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mod_sm2_4) + .text + +#define z %rdi +#define x %rsi + +#define d0 %rdx +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 +#define c %rax + +S2N_BN_SYMBOL(bignum_mod_sm2_4): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input and subtract to get [d3;d3;d1;d1] = x - p_sm2 (modulo 2^256) +// The constants n1 and n3 in [n3; -1; n1; -1] = p_sm2 are saved for later + + movq (x), d0 + subq $-1, d0 + movq 8(x), d1 + movq $0xffffffff00000000, n1 + sbbq n1, d1 + movq 16(x), d2 + sbbq $-1, d2 + movq $0xfffffffeffffffff, n3 + movq 24(x), d3 + sbbq n3, d3 + +// Capture the carry to determine whether to add back p_sm2, and use +// it to create a masked p_sm2' = [n3; c; n1; c] + + sbbq c, c + andq c, n1 + andq c, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq c, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montinv_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montinv_sm2.S new file mode 100644 index 00000000000..0c0d2507726 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montinv_sm2.S @@ -0,0 +1,1640 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_sm2 = 2^256 - 2^224 - 2^96 + 2^64 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_montinv_sm2(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_sm2, i.e. is not divisible +// by it, returns z < p_sm2 such that x * z == 2^512 (mod p_sm2). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^256 * X and z == 2^256 * Z +// (both mod p_sm2) then X * Z == 1 (mod p_sm2). That is, this function +// gives the analog of the modular inverse bignum_inv_sm2 but with both +// input and output in the Montgomery domain. 
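+// (Equivalently, in plain terms: z is 2^512 * x^-1 mod p_sm2, i.e. the
+// ordinary modular inverse of x carrying an extra factor of 2^512, which is
+// where the congruence x * z == 2^512 above comes from.)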
Note that x does not need +// to be reduced modulo p_sm2, but the output always is. If the input +// is divisible (i.e. is 0 or p_sm2), then there can be no solution to +// the congruence x * z == 2^512 (mod p_sm2), and z = 0 is returned. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_sm2) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define f 0(%rsp) +#define g (5*N)(%rsp) +#define u (10*N)(%rsp) +#define v (15*N)(%rsp) +#define tmp (20*N)(%rsp) +#define tmp2 (21*N)(%rsp) +#define i (22*N)(%rsp) +#define d (23*N)(%rsp) + +#define mat (24*N)(%rsp) + +// Backup for the input pointer + +#define res (28*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (30*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (5*N) +#define U (10*N) +#define V (15*N) +#define MAT (24*N) + +#define ff (%rsp) +#define gg (5*N)(%rsp) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro from u[4..0] to u[3..0]. +// --------------------------------------------------------------------------- + +#define amontred(P) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_sm2. */ \ + movq $0xe000000000000000, %r8 ; \ + addq P, %r8 ; \ + movq $0x1fffffffffffffff, %r9 ; \ + adcq 8+P, %r9 ; \ + movq $0xffffffffe0000000, %r10 ; \ + adcq 16+P, %r10 ; \ + movq $0xffffffffffffffff, %r11 ; \ + adcq 24+P, %r11 ; \ + movq $0x1fffffffdfffffff, %r12 ; \ + adcq 32+P, %r12 ; \ +/* Let [%rcx;%rbx] = 2^32 * d0 and [%rdx;%rax] = (2^32-1) * d0 */ \ + movq %r8, %rbx ; \ + movq %r8, %rcx ; \ + shrq $32, %rcx ; \ + shlq $32, %rbx ; \ + movl $0xffffffff, %eax ; \ + mulq %r8; \ +/* Now [%r12;%r11;%r10;%r9] := [%r8;%r11;%r10;%r9] - [%rcx;%rbx;%rdx;%rax] */ \ + subq %rax, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + sbbq %rcx, %r8 ; \ + addq %r8, %r12 ; \ +/* Now capture carry and subtract p_sm2 if set (almost-Montgomery) */ \ + sbbq %rax, %rax ; \ + movl $0xffffffff, %ebx ; \ + notq %rbx; \ + andq %rax, %rbx ; \ + movq %rax, %rdx ; \ + btr $32, %rdx ; \ + subq %rax, %r9 ; \ + movq %r9, P ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 8+P ; \ + sbbq %rax, %r11 ; \ + movq %r11, 16+P ; \ + sbbq %rdx, %r12 ; \ + movq %r12, 24+P + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; 
\ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq 
$0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, 
%r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq 
MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + 
subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_montinv_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Create constant [%rdx;%rcx;%rbx;%rax] = p_sm2 and copy it into the 
variable f +// including the 5th zero digit + + xorl %ebp, %ebp + leaq -1(%rbp), %rax + movl $0x00000000ffffffff, %ebx + notq %rbx + movq %rax, %rcx + movq %rax, %rdx + btr $32, %rdx + + movq %rax, F(%rsp) + movq %rbx, F+8(%rsp) + movq %rcx, F+16(%rsp) + movq %rdx, F+24(%rsp) + movq %rbp, F+32(%rsp) + +// Now reduce the input modulo p_sm2, first negating the constant to get +// [%rdx;%rcx;%rbx;%rax] = 2^256 - p_sm2, adding it to x and hence getting +// the comparison x < p_sm2 <=> (2^256 - p_sm2) + x < 2^256 and choosing +// g accordingly. + + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + + movl $1, %eax + notq %rbx + xorl %ecx, %ecx + notq %rdx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq %rax, G(%rsp) + movq %rbx, G+8(%rsp) + movq %rcx, G+16(%rsp) + movq %rdx, G+24(%rsp) + xorl %eax, %eax + movq %rax, G+32(%rsp) + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-562} * [u,v] (mod p_sm2) +// starting with [p_sm2,x] == x * 2^{5*0-562} * [0,2^562] (mod p_sm2) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 10th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{50-562} * u == 1, i.e. +// x * u == 2^512 as required. + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+8(%rsp) + movq %rax, U+16(%rsp) + movq %rax, U+24(%rsp) + + movq $0x000c000000100000, %rax + movq %rax, V(%rsp) + movq $0x000bfffffff80000, %rax + movq %rax, V+8(%rsp) + movq $0x00040000000c0000, %rax + movq %rax, V+16(%rsp) + movq $0x0018000000040000, %rax + movq %rax, V+24(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + movq $10, i + movq $1, d + jmp bignum_montinv_sm2_midloop + +bignum_montinv_sm2_loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
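+// Schematically, each digit pass below implements the following sketch
+// (illustrative pseudo-C only; m00, m01, m10, m11 stand for the signed matrix
+// entries returned by divstep59 in %r8, %r10, %r12, %r14, up to the overall
+// negation noted earlier, and old_f/old_g are the incoming 5-digit values):
+//
+//     new_f = (m00 * old_f + m01 * old_g) >> 59;
+//     new_g = (m10 * old_f + m11 * old_g) >> 59;
+//
+// realized digit-by-digit with sign-magnitude multiplications and a two-word
+// carry between digits.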
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digits 3 and 4 of [f,g] + + movq F+3*N(%rsp), %rax + xorq %r9, %rax + movq F+4*N(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + movq G+4*N(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + shrdq $59, %rbp, %rsi + sarq $59, %rbp + + movq F+3*N(%rsp), %rax + movq %rsi, F+3*N(%rsp) + + movq F+4*N(%rsp), %rsi + movq %rbp, F+4*N(%rsp) + + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq G+3*N(%rsp), %rax + xorq %r15, %rax + movq G+4*N(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + shrdq $59, %rsi, %rbx + movq %rbx, G+3*N(%rsp) + sarq $59, %rsi + movq %rsi, G+4*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq 
%rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Preload for last use of old u digit 3 + + movq U+3*N(%rsp), %rax + movq %rcx, U+3*N(%rsp) + movq %rdx, U+4*N(%rsp) + +// Digits 3 and 4 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+3*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, V+3*N(%rsp) + movq %rdx, V+4*N(%rsp) + +// Montgomery reduction of u + + amontred(u) + +// Montgomery reduction of v + + amontred(v) + +bignum_montinv_sm2_midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz bignum_montinv_sm2_loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-512} [u,v] (mod p_sm2) +// we want to flip the sign of u according to that of f. 
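+// (For reference, the sign-magnitude split below uses the standard
+// two's-complement identity: with s = v >> 63 (arithmetic shift, so s is
+// 0 or -1), (v ^ s) - s == |v|, while s itself records the sign.)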
+ + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + +// Store back and Montgomery reduce u + + movq %r12, U(%rsp) + movq %r13, U+N(%rsp) + movq %r14, U+2*N(%rsp) + movq %r15, U+3*N(%rsp) + movq %r9, U+4*N(%rsp) + + amontred(u) + +// Perform final strict reduction mod p_sm2 and copy to output + + movq U(%rsp), %r8 + movq U+N(%rsp), %r9 + movq U+2*N(%rsp), %r10 + movq U+3*N(%rsp), %r11 + + movl $1, %eax + movl $0x00000000ffffffff, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + bts $32, %rdx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq res, %rdi + movq %rax, (%rdi) + movq %rbx, N(%rdi) + movq %rcx, 2*N(%rdi) + movq %rdx, 3*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2.S new file mode 100644 index 00000000000..e381d54bb82 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2.S @@ -0,0 +1,212 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_sm2, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_sm2 (in particular this is true if we are in +// the "usual" case x < p_sm2 and y < p_sm2). 
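+// Illustrative note: the precondition above holds automatically for reduced
+// inputs, since x, y < p_sm2 < 2^256 gives x * y < p_sm2 * 2^256. A
+// hypothetical C caller, using just the declaration above, might read:
+//
+//     uint64_t z[4], x[4] = {2, 0, 0, 0}, y[4] = {3, 0, 0, 0};
+//     bignum_montmul_sm2(z, x, y);   // z = 2 * 3 / 2^256 mod p_sm2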
+// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_sm2) + .text + +#define z %rdi +#define x %rsi + +// We move the y argument here so we can use %rdx for multipliers + +#define y %rcx + +// Use this fairly consistently for a zero + +#define zero %rbp +#define zeroe %ebp + +// mulpadd(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries. + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +// mulpade(high,low,m) adds %rdx * m to a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax as a temporary, assuming high created from scratch +// and that zero has value zero. + +#define mulpade(high,low,m) \ + mulxq m, %rax, high ; \ + adcxq %rax, low ; \ + adoxq zero, high + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rbx +// as temporaries. +// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rbx, d0 + +S2N_BN_SYMBOL(bignum_montmul_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Zero a register, which also makes sure we don't get a fake carry-in + + xorl zeroe, zeroe + +// Do the zeroth row, which is a bit different + + movq (y), %rdx + + mulxq (x), %r8, %r9 + mulxq 8(x), %rax, %r10 + addq %rax, %r9 + mulxq 16(x), %rax, %r11 + adcq %rax, %r10 + mulxq 24(x), %rax, %r12 + adcq %rax, %r11 + adcq zero, %r12 + +// Add row 1 + + xorl zeroe, zeroe + movq 8(y), %rdx + mulpadd(%r10,%r9,(x)) + mulpadd(%r11,%r10,8(x)) + mulpadd(%r12,%r11,16(x)) + mulpade(%r13,%r12,24(x)) + adcxq zero, %r13 + +// Add row 2 + + xorl zeroe, zeroe + movq 16(y), %rdx + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + mulpadd(%r13,%r12,16(x)) + mulpade(%r14,%r13,24(x)) + adcxq zero, %r14 + +// Add row 3 + + xorl zeroe, zeroe + movq 24(y), %rdx + mulpadd(%r12,%r11,(x)) + mulpadd(%r13,%r12,8(x)) + mulpadd(%r14,%r13,16(x)) + mulpade(%r15,%r14,24(x)) + adcxq zero, %r15 + +// Multiplication complete. 
Perform 4 Montgomery steps to rotate the lower half + + montreds(%r11,%r10,%r9,%r8) + montreds(%r8,%r11,%r10,%r9) + montreds(%r9,%r8,%r11,%r10) + montreds(%r10,%r9,%r8,%r11) + +// Add high and low parts, catching carry in %rax + + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + +// Load [%r8;%r11;%rbp;%rdx;%rcx] = 2^320 - p_sm2 then do +// [%r8;%r11;%rbp;%rdx;%rcx] = [%rax;%r15;%r14;%r13;%r12] + (2^320 - p_sm2) + + movl $1, %ecx + movl $0x00000000FFFFFFFF, %edx + xorl %ebp, %ebp + addq %r12, %rcx + leaq 1(%rdx), %r11 + adcq %r13, %rdx + leaq -1(%rbp), %r8 + adcq %r14, %rbp + adcq %r15, %r11 + adcq %rax, %r8 + +// Now carry is set if r + (2^320 - p_sm2) >= 2^320, i.e. r >= p_sm2 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %rbp, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2_alt.S new file mode 100644 index 00000000000..23ce5a8dd30 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montmul_sm2_alt.S @@ -0,0 +1,214 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery multiply, z := (x * y / 2^256) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_montmul_sm2_alt +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Does z := (2^{-256} * x * y) mod p_sm2, assuming that the inputs x and y +// satisfy x * y <= 2^256 * p_sm2 (in particular this is true if we are in +// the "usual" case x < p_sm2 and y < p_sm2). +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montmul_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montmul_sm2_alt) + .text + +#define z %rdi +#define x %rsi + +// We move the y argument here so we can use %rdx for multipliers + +#define y %rcx + +// Macro for the key "multiply and add to (c,h,l)" step + +#define combadd(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq $0, c + +// A minutely shorter form for when c = 0 initially + +#define combadz(c,h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h ; \ + adcq c, c + +// A short form where we don't expect a top carry + +#define combads(h,l,numa,numb) \ + movq numa, %rax ; \ + mulq numb; \ + addq %rax, l ; \ + adcq %rdx, h + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. 
Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rbx +// as temporaries. +// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rbx, d0 + +S2N_BN_SYMBOL(bignum_montmul_sm2_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Copy y into a safe register to start with + + movq %rdx, y + +// Start the window as [%r10;%r9;%r8] with 00 product + + movq (x), %rax + mulq (y) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + +// Column 1 + + xorq %r11, %r11 + combads(%r10,%r9,(x),8(y)) + combadz(%r11,%r10,%r9,8(x),(y)) + +// Column 2 + + xorq %r12, %r12 + combadz(%r12,%r11,%r10,(x),16(y)) + combadd(%r12,%r11,%r10,8(x),8(y)) + combadd(%r12,%r11,%r10,16(x),(y)) + +// Column 3 + + xorq %r13, %r13 + combadz(%r13,%r12,%r11,(x),24(y)) + combadd(%r13,%r12,%r11,8(x),16(y)) + combadd(%r13,%r12,%r11,16(x),8(y)) + combadd(%r13,%r12,%r11,24(x),(y)) + +// Column 4 + + xorq %r14, %r14 + combadz(%r14,%r13,%r12,8(x),24(y)) + combadd(%r14,%r13,%r12,16(x),16(y)) + combadd(%r14,%r13,%r12,24(x),8(y)) + +// Column 5 + + xorq %r15, %r15 + combadz(%r15,%r14,%r13,16(x),24(y)) + combadd(%r15,%r14,%r13,24(x),16(y)) + +// Final work for columns 6 and 7 + + movq 24(x), %rax + mulq 24(y) + addq %rax, %r14 + adcq %rdx, %r15 + +// Multiplication complete. Perform 4 Montgomery steps to rotate the lower half + + montreds(%r11,%r10,%r9,%r8) + montreds(%r8,%r11,%r10,%r9) + montreds(%r9,%r8,%r11,%r10) + montreds(%r10,%r9,%r8,%r11) + +// Add high and low parts, catching carry in %rax + + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + +// Load [%r8;%r11;%rbx;%rdx;%rcx] = 2^320 - p_sm2 then do +// [%r8;%r11;%rbx;%rdx;%rcx] = [%rax;%r15;%r14;%r13;%r12] + (2^320 - p_sm2) + + movl $1, %ecx + movl $0x00000000FFFFFFFF, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 1(%rdx), %r11 + adcq %r13, %rdx + leaq -1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + +// Now carry is set if r + (2^320 - p_sm2) >= 2^320, i.e. r >= p_sm2 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %rbx, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2.S new file mode 100644 index 00000000000..5ecefb2c68c --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2.S @@ -0,0 +1,194 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_sm2, assuming x^2 <= 2^256 * p_sm2, which is +// guaranteed in particular if x < p_sm2 initially (the "intended" case). +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_sm2) + .text + +#define z %rdi +#define x %rsi + +// Use this fairly consistently for a zero + +#define zero %rbp +#define zeroe %ebp + +// Add %rdx * m into a register-pair (high,low) +// maintaining consistent double-carrying with adcx and adox, +// using %rax and %rbx as temporaries + +#define mulpadd(high,low,m) \ + mulxq m, %rax, %rbx ; \ + adcxq %rax, low ; \ + adoxq %rbx, high + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rbx +// as temporaries. +// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rbx, d0 + +S2N_BN_SYMBOL(bignum_montsqr_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rdx + mulxq %rdx, %r8, %r15 + mulxq 8(x), %r9, %r10 + mulxq 24(x), %r11, %r12 + movq 16(x), %rdx + mulxq 24(x), %r13, %r14 + +// Clear our zero register, and also initialize the flags for the carry chain + + xorl zeroe, zeroe + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadd(%r11,%r10,(x)) + mulpadd(%r12,%r11,8(x)) + movq 24(x), %rdx + mulpadd(%r13,%r12,8(x)) + adcxq zero, %r13 + adoxq zero, %r14 + adcq zero, %r14 + +// Double and add to the 00 + 11 + 22 + 33 terms + + xorl zeroe, zeroe + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 8(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 16(x), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 24(x), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq zero, %r15 + adoxq zero, %r15 + +// Squaring complete. 
Perform 4 Montgomery steps to rotate the lower half + + montreds(%r11,%r10,%r9,%r8) + montreds(%r8,%r11,%r10,%r9) + montreds(%r9,%r8,%r11,%r10) + montreds(%r10,%r9,%r8,%r11) + +// Add high and low parts, catching carry in %rax + + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + +// Load [%r8;%r11;%rbp;%rdx;%rcx] = 2^320 - p_sm2 then do +// [%r8;%r11;%rbp;%rdx;%rcx] = [%rax;%r15;%r14;%r13;%r12] + (2^320 - p_sm2) + + movl $1, %ecx + movl $0x00000000FFFFFFFF, %edx + xorl %ebp, %ebp + addq %r12, %rcx + leaq 1(%rdx), %r11 + adcq %r13, %rdx + leaq -1(%rbp), %r8 + adcq %r14, %rbp + adcq %r15, %r11 + adcq %rax, %r8 + +// Now carry is set if r + (2^320 - p_sm2) >= 2^320, i.e. r >= p_sm2 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %rbp, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2_alt.S new file mode 100644 index 00000000000..cd970f10265 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_montsqr_sm2_alt.S @@ -0,0 +1,223 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery square, z := (x^2 / 2^256) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_montsqr_sm2_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Does z := (x^2 / 2^256) mod p_sm2, assuming x^2 <= 2^256 * p_sm2, which is +// guaranteed in particular if x < p_sm2 initially (the "intended" case). +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montsqr_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montsqr_sm2_alt) + .text + +#define z %rdi +#define x %rsi + +// Add %rbx * m into a register-pair (high,low) maintaining consistent +// carry-catching with carry (negated, as bitmask) and using %rax and %rdx +// as temporaries + +#define mulpadd(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// Initial version assuming no carry-in + +#define mulpadi(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + addq %rax, low ; \ + adcq %rdx, high ; \ + sbbq carry, carry + +// End version not catching the top carry-out + +#define mulpade(carry,high,low,m) \ + movq m, %rax ; \ + mulq %rbx; \ + subq carry, %rdx ; \ + addq %rax, low ; \ + adcq %rdx, high + +// --------------------------------------------------------------------------- +// Core one-step "short" Montgomery reduction macro. Takes input in +// [d3;d2;d1;d0] and returns result in [d0;d3;d2;d1], adding to the +// existing contents of [d3;d2;d1], and using %rax, %rcx, %rdx and %rbx +// as temporaries. 
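+//
+// For intuition, the same word-step can be written as a portable C sketch
+// (illustrative only, not part of the assembled sources; the helper name
+// montreds_step and the GCC/Clang unsigned __int128 extension are
+// assumptions of the sketch). Since the low word of p_sm2 is 2^64 - 1, the
+// Montgomery quotient word is simply d0, so, ignoring the register rotation
+// used above, the step on a standalone 4-word value computes
+// ([d3;d2;d1;d0] + d0 * p_sm2) >> 64:
+//
+//   #include <stdint.h>
+//
+//   static const uint64_t P_SM2[4] = {
+//       0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF00000000ull,
+//       0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFEFFFFFFFFull};
+//
+//   static void montreds_step(uint64_t d[4]) {
+//     uint64_t q = d[0], out[4];      // -p_sm2^-1 = 1 (mod 2^64), so q = d0
+//     unsigned __int128 acc =
+//         (unsigned __int128)d[0] + (unsigned __int128)q * P_SM2[0];
+//     acc >>= 64;                     // low word is exactly zero by design
+//     for (int i = 1; i < 4; i++) {
+//       acc += (unsigned __int128)d[i] + (unsigned __int128)q * P_SM2[i];
+//       out[i - 1] = (uint64_t)acc;
+//       acc >>= 64;
+//     }
+//     out[3] = (uint64_t)acc;         // result < p_sm2 + 2^192, four words
+//     for (int i = 0; i < 4; i++) d[i] = out[i];
+//   }
+//
+// The macro below obtains the same additions of d0 * p_sm2 purely with
+// 32-bit shift and subtract steps, avoiding any multiplication.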
+// --------------------------------------------------------------------------- + +#define montreds(d3,d2,d1,d0) \ + movq d0, %rax ; \ + shlq $32, %rax ; \ + movq d0, %rcx ; \ + shrq $32, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq d0, %rax ; \ + sbbq $0, %rcx ; \ + subq %rax, d1 ; \ + sbbq %rcx, d2 ; \ + sbbq %rdx, d3 ; \ + sbbq %rbx, d0 + +S2N_BN_SYMBOL(bignum_montsqr_sm2_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save more registers to play with + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + +// Compute [%r15;%r8] = [00] which we use later, but mainly +// set up an initial window [%r14;...;%r9] = [23;03;01] + + movq (x), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 8(x), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 24(x), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 16(x), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + +// Chain in the addition of 02 + 12 + 13 to that window (no carry-out possible) +// This gives all the "heterogeneous" terms of the squaring ready to double + + mulpadi(%rcx,%r11,%r10,(x)) + mulpadd(%rcx,%r12,%r11,8(x)) + movq 24(x), %rbx + mulpade(%rcx,%r13,%r12,8(x)) + adcq $0, %r14 + +// Double the window [%r14;...;%r9], catching top carry in %rcx + + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + +// Add to the 00 + 11 + 22 + 33 terms + + movq 8(x), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 16(x), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 24(x), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + +// Squaring complete. Perform 4 Montgomery steps to rotate the lower half + + montreds(%r11,%r10,%r9,%r8) + montreds(%r8,%r11,%r10,%r9) + montreds(%r9,%r8,%r11,%r10) + montreds(%r10,%r9,%r8,%r11) + +// Add high and low parts, catching carry in %rax + + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + +// Load [%r8;%r11;%rbx;%rdx;%rcx] = 2^320 - p_sm2 then do +// [%r8;%r11;%rbx;%rdx;%rcx] = [%rax;%r15;%r14;%r13;%r12] + (2^320 - p_sm2) + + movl $1, %ecx + movl $0x00000000FFFFFFFF, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 1(%rdx), %r11 + adcq %r13, %rdx + leaq -1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + +// Now carry is set if r + (2^320 - p_sm2) >= 2^320, i.e. r >= p_sm2 +// where r is the pre-reduced form. So conditionally select the +// output accordingly. + + cmovcq %rcx, %r12 + cmovcq %rdx, %r13 + cmovcq %rbx, %r14 + cmovcq %r11, %r15 + +// Write back reduced value + + movq %r12, (z) + movq %r13, 8(z) + movq %r14, 16(z) + movq %r15, 24(z) + +// Restore saved registers and return + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_neg_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_neg_sm2.S new file mode 100644 index 00000000000..05a90e432b0 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_neg_sm2.S @@ -0,0 +1,90 @@ +// Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Negate modulo p_sm2, z := (-x) mod p_sm2, assuming x reduced +// Input x[4]; output z[4] +// +// extern void bignum_neg_sm2 (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_neg_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_neg_sm2) + .text + +#define z %rdi +#define x %rsi + +#define q %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %r11 + +S2N_BN_SYMBOL(bignum_neg_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the input digits as [d3;d2;d1;d0] and also set a bitmask q +// for the input being nonzero, so that we avoid doing -0 = p_sm2 +// and hence maintain strict modular reduction + + movq (x), d0 + movq 8(x), d1 + movq d0, n1 + orq d1, n1 + movq 16(x), d2 + movq 24(x), d3 + movq d2, n3 + orq d3, n3 + orq n1, n3 + negq n3 + sbbq q, q + +// Load the non-trivial words of p_sm2 = [n3;-1;n1;-1] and mask them with q + + movq $0xffffffff00000000, n1 + movq $0xfffffffeffffffff, n3 + andq q, n1 + andq q, n3 + +// Do the subtraction, using an xor for the first digit and getting the +// overall result as [n3;q;n1;d0], all these tweaks just to avoid moves + + xorq q, d0 + subq d1, n1 + sbbq d2, q + sbbq d3, n3 + +// Write back + + movq d0, (z) + movq n1, 8(z) + movq q, 16(z) + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_optneg_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_optneg_sm2.S new file mode 100644 index 00000000000..f58342adc20 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_optneg_sm2.S @@ -0,0 +1,100 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Optionally negate modulo p_sm2, z := (-x) mod p_sm2 (if p nonzero) or +// z := x (if p zero), assuming x reduced +// Inputs p, x[4]; output z[4] +// +// extern void bignum_optneg_sm2 +// (uint64_t z[static 4], uint64_t p, uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = p, RDX = x +// Microsoft x64 ABI: RCX = z, RDX = p, R8 = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_optneg_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_optneg_sm2) + .text + +#define z %rdi +#define q %rsi +#define x %rdx + +#define n0 %rax +#define n1 %rcx +#define n2 %r8 +#define n3 %r9 + +S2N_BN_SYMBOL(bignum_optneg_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Adjust q by zeroing it if the input is zero (to avoid giving -0 = p_sm2, +// which is not strictly reduced even though it's correct modulo p_sm2). 
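+//
+// As a portable illustration of this adjustment (a sketch only; the helper
+// name optneg_adjust is illustrative and not part of this file), the flag
+// can be zeroed in constant time by collapsing the input words into a
+// nonzero-bitmask, mirroring the or/neg/sbb/and sequence below:
+//
+//   #include <stdint.h>
+//
+//   // Return q unchanged if x is nonzero, and 0 if x is all zero, branch-free.
+//   static uint64_t optneg_adjust(uint64_t q, const uint64_t x[4]) {
+//     uint64_t t = x[0] | x[1] | x[2] | x[3];
+//     uint64_t mask = (uint64_t)0 - ((t | (0 - t)) >> 63);  // all-ones iff t != 0
+//     return q & mask;
+//   }
+//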
+// This step is redundant if we know a priori that the input is nonzero, which +// is the case for the y coordinate of points on the SM2 curve, for example. + + movq (x), n0 + orq 8(x), n0 + movq 16(x), n1 + orq 24(x), n1 + orq n1, n0 + negq n0 + sbbq n0, n0 + andq n0, q + +// Turn q into a bitmask, all 1s for q=false, all 0s for q=true + + negq q + sbbq q, q + notq q + +// Let [n3;n2;n1;n0] = if q then p_sm2 else -1 + + movq $0xffffffffffffffff, n0 + movq $0xffffffff00000000, n1 + orq q, n1 + movq n0, n2 + movq $0xfffffffeffffffff, n3 + orq q, n3 + +// Subtract so [n3;n2;n1;n0] = if q then p_sm2 - x else -1 - x + + subq (x), n0 + sbbq 8(x), n1 + sbbq 16(x), n2 + sbbq 24(x), n3 + +// XOR the words with the bitmask, which in the case q = false has the +// effect of restoring ~(-1 - x) = -(-1 - x) - 1 = 1 + x - 1 = x +// and write back the digits to the output + + xorq q, n0 + movq n0, (z) + xorq q, n1 + movq n1, 8(z) + xorq q, n2 + movq n2, 16(z) + xorq q, n3 + movq n3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_sub_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_sub_sm2.S new file mode 100644 index 00000000000..d6898b4a070 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_sub_sm2.S @@ -0,0 +1,89 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Subtract modulo p_sm2, z := (x - y) mod p_sm2 +// Inputs x[4], y[4]; output z[4] +// +// extern void bignum_sub_sm2 +// (uint64_t z[static 4], uint64_t x[static 4], uint64_t y[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y +// Microsoft x64 ABI: RCX = z, RDX = x, R8 = y +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_sub_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_sub_sm2) + .text + +#define z %rdi +#define x %rsi +#define y %rdx + +#define d0 %rax +#define d1 %rcx +#define d2 %r8 +#define d3 %r9 + +#define n1 %r10 +#define n3 %rdx +#define c %r11 + +#define n1short %r10d + + + +S2N_BN_SYMBOL(bignum_sub_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Load and subtract the two inputs as [d3;d2;d1;d0] = x - y (modulo 2^256) + + movq (x), d0 + subq (y), d0 + movq 8(x), d1 + sbbq 8(y), d1 + movq 16(x), d2 + sbbq 16(y), d2 + movq 24(x), d3 + sbbq 24(y), d3 + +// Capture the carry, which indicates x < y, and create corresponding masked +// correction p_sm2' = [n3; c; n1; c] to add + + movq $0xffffffff00000000, n1 + sbbq c, c + andq c, n1 + movq c, n3 + btr $32, n3 + +// Do the corrective addition and copy to output + + addq c, d0 + movq d0, (z) + adcq n1, d1 + movq d1, 8(z) + adcq c, d2 + movq d2, 16(z) + adcq n3, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_tomont_sm2.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_tomont_sm2.S new file mode 100644 index 00000000000..514c8b93c09 --- /dev/null +++ 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_tomont_sm2.S @@ -0,0 +1,144 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Convert to Montgomery form z := (2^256 * x) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_tomont_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_tomont_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_tomont_sm2) + .text + +#define z %rdi +#define x %rsi + +#define m0 %r8 +#define m1 %r9 +#define m2 %r10 +#define m3 %r11 + +#define q %rax +#define n1 %rcx +#define n3 %rdx +#define qshort %eax + +// ---------------------------------------------------------------------------- +// Core "x |-> (2^64 * x) mod p_sm2" macro, with x assumed to be < p_sm2. +// Input is [d3;d2;d1;d0] and output is [d2;d1;d0;q] where q is a fixed +// quotient estimate register (%rax), so the registers get shuffled. +// ---------------------------------------------------------------------------- + +#define modstep_sm2(d3,d2,d1,d0) \ +/* Writing the input, with lower zero digit appended, as */ \ +/* z = 2^256 * d3 + 2^192 * d2 + t, quotient approximation is */ \ +/* MIN ((d3 * (1 + 2^32 + 2^64) + d2 + 2^64) >> 64) (2^64 - 1) */ \ + movq d2, n1 ; \ + movl $1, qshort ; \ + addq d3, n1 ; \ + adcq d3, q ; \ + shrq $32, n1 ; \ + addq d3, n1 ; \ + shrq $32, n1 ; \ + addq n1, q ; \ + sbbq $0, q ; \ +/* Compute the pre-reduced [d3;d2;d1;d0;q] = m - p_sm2 * q */ \ +/* = z - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q */ \ + movq q, n1 ; \ + movq q, n3 ; \ + shlq $32, n1 ; \ + shrq $32, n3 ; \ + addq n1, d2 ; \ + adcq n3, d3 ; \ + subq q, n1 ; \ + sbbq $0, n3 ; \ + subq q, d3 ; \ + addq n1, d0 ; \ + adcq n3, d1 ; \ + adcq $0, d2 ; \ + adcq $0, d3 ; \ +/* Corrective addition with top word d3 as a bitmask */ \ + movq $0xffffffff00000000, n1 ; \ + andq d3, n1 ; \ + movq $0xfffffffeffffffff, n3 ; \ + andq d3, n3 ; \ + addq d3, q ; \ + adcq n1, d0 ; \ + adcq d3, d1 ; \ + adcq n3, d2 + +S2N_BN_SYMBOL(bignum_tomont_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Load the inputs + + movq (x), m0 + movq 8(x), m1 + movq 16(x), m2 + movq 24(x), m3 + +// Load non-trivial digits [n3; -1; n1; -1] = p_sm2 and do a conditional +// subtraction to reduce the four starting digits [m3;m2;m1;m0] modulo p_sm2 + + subq $-1, m0 + movq $0xffffffff00000000, n1 + sbbq n1, m1 + movq $0xfffffffeffffffff, n3 + sbbq $-1, m2 + sbbq n3, m3 + sbbq q, q + andq q, n1 + andq q, n3 + addq q, m0 + adcq n1, m1 + adcq q, m2 + adcq n3, m3 + +// Now do 4 iterations of 5->4 word modular reduction + + modstep_sm2(m3,m2,m1,m0) + + movq q, m3 + + modstep_sm2(m2,m1,m0,m3) + + movq q, m2 + + modstep_sm2(m1,m0,m3,m2) + + movq q, m1 + + modstep_sm2(m0,m3,m2,m1) + +// Write back result and return + + movq q, (z) + movq m1, 8(z) + movq m2, 16(z) + movq m3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2.S 
b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2.S new file mode 100644 index 00000000000..4e19aa85f19 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2.S @@ -0,0 +1,128 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_sm2, z := (3 * x) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_triple_sm2 +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_sm2, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_sm2. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_sm2) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_sm2) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define q %rdx +#define h %rdx +#define qshort %edx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx + +#define ashort %eax + +S2N_BN_SYMBOL(bignum_triple_sm2): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + xorl ashort, ashort + + movq (x), q + movq q, d0 + adcxq q, q + adoxq q, d0 + movq 8(x), q + movq q, d1 + adcxq q, q + adoxq q, d1 + movq 16(x), q + movq q, d2 + adcxq q, q + adoxq q, d2 + movq 24(x), q + movq q, d3 + adcxq q, q + adoxq q, d3 + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_sm2 <= z - q * p_sm2 < p_sm2, so we just need +// to subtract q * p_sm2 and then if that's negative, add back p_sm2. + + movl $1, qshort + adcxq a, q + adoxq a, q + +// Now compute the initial pre-reduced [h;d3;d2;d1;d0] = z - p_sm2 * q +// = z - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q + + movq q, a + shlq $32, a + movq a, c + subq q, a + + addq q, d0 + adcq a, d1 + adcq $0, d2 + adcq c, d3 + sbbq h, h + notq h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_sm2 + + movq $0xffffffff00000000, a + andq h, a + movq $0xfffffffeffffffff, c + andq h, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq h, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2_alt.S new file mode 100644 index 00000000000..a06d91f8097 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/bignum_triple_sm2_alt.S @@ -0,0 +1,131 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Triple modulo p_sm2, z := (3 * x) mod p_sm2 +// Input x[4]; output z[4] +// +// extern void bignum_triple_sm2_alt +// (uint64_t z[static 4], uint64_t x[static 4]); +// +// The input x can be any 4-digit bignum, not necessarily reduced modulo p_sm2, +// and the result is always fully reduced, i.e. z = (3 * x) mod p_sm2. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_triple_sm2_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_triple_sm2_alt) + .text + +#define z %rdi +#define x %rsi + +// Main digits of intermediate results + +#define d0 %r8 +#define d1 %r9 +#define d2 %r10 +#define d3 %r11 + +// Quotient estimate = top of product + 1 + +#define q %rdx +#define h %rdx + +// Other temporary variables and their short version + +#define a %rax +#define c %rcx +#define d %rdx + +#define ashort %eax +#define cshort %ecx + +S2N_BN_SYMBOL(bignum_triple_sm2_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// First do the multiplication by 3, getting z = [h; d3; ...; d0] +// but immediately form the quotient estimate q = h + 1 + + movl $3, cshort + + movq (x), a + mulq c + movq a, d0 + movq d, d1 + + movq 8(x), a + xorq d2, d2 + mulq c + addq a, d1 + adcq d, d2 + + movq 16(x), a + xorq d3, d3 + mulq c + addq a, d2 + adcq d, d3 + + movq 24(x), a + mulq c + addq a, d3 + +// For this limited range a simple quotient estimate of q = h + 1 works, where +// h = floor(z / 2^256). Then -p_sm2 <= z - q * p_sm2 < p_sm2, so we just need +// to subtract q * p_sm2 and then if that's negative, add back p_sm2. + + adcq $1, q + +// Now compute the initial pre-reduced [h;d3;d2;d1;d0] = z - p_sm2 * q +// = z - (2^256 - 2^224 - 2^96 + 2^64 - 1) * q + + movq q, a + shlq $32, a + movq a, c + subq q, a + + addq q, d0 + adcq a, d1 + adcq $0, d2 + adcq c, d3 + sbbq h, h + notq h + +// Now our top word h is either zero or all 1s, and we use this to discriminate +// whether a correction is needed because our result is negative, as a bitmask +// Do a masked addition of p_sm2 + + movq $0xffffffff00000000, a + andq h, a + movq $0xfffffffeffffffff, c + andq h, c + addq h, d0 + movq d0, (z) + adcq a, d1 + movq d1, 8(z) + adcq h, d2 + movq d2, 16(z) + adcq c, d3 + movq d3, 24(z) + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd.S new file mode 100644 index 00000000000..75313535e71 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd.S @@ -0,0 +1,621 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// By design, none of the code macros modify any of +// these, so we maintain the assignments throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) + +// Corresponds to bignum_montmul_sm2 except for registers + +#define montmul_sm2(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; 
\ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2 except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, 
%r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_sm2(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + sbbq %rax, %rax ; \ + movq $0xffffffff00000000, %rbx ; \ + movq %rax, %rcx ; \ + andq %rax, %rbx ; \ + btr $32, %rcx ; \ + subq %rax, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq %rax, %r14 ; \ + sbbq %rcx, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, 
%rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 + +#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(sm2_montjadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts throughout the main code. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + amontsqr_sm2(z1sq,z_1) + amontsqr_sm2(z2sq,z_2) + + montmul_sm2(y1a,z_2,y_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,z1sq,x_2) + montmul_sm2(x1a,z2sq,x_1) + montmul_sm2(y2a,z1sq,y2a) + montmul_sm2(y1a,z2sq,y1a) + + sub_sm2(xd,x2a,x1a) + sub_sm2(yd,y2a,y1a) + + amontsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x1a) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(xd,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y1a) + + montmul_sm2(resz,xd,z_2) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, 
"", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd_alt.S new file mode 100644 index 00000000000..6e91054a0ad --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjadd_alt.S @@ -0,0 +1,559 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 12]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// By design, none of the code macros modify any of +// these, so we maintain the assignments throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) +#define z_2 (2*NUMSIZE)(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z1sq (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define x1a (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define z2sq (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define y1a (NUMSIZE*6)(%rsp) + +#define NSPACE (NUMSIZE*7) +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; 
\ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2_alt except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax 
; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +#define czload4(r0,r1,r2,r3,P) \ + cmovzq P, r0 ; \ + cmovzq 8+P, r1 ; \ + cmovzq 16+P, r2 ; \ + cmovzq 24+P, r3 + +#define muxload4(r0,r1,r2,r3,P0,P1,P2) \ + movq P0, r0 ; \ + cmovbq P1, r0 ; \ + cmovnbe P2, r0 ; \ + movq 8+P0, r1 ; \ + cmovbq 8+P1, r1 ; \ + cmovnbe 8+P2, r1 ; \ + movq 16+P0, r2 ; \ + cmovbq 16+P1, r2 ; \ + cmovnbe 16+P2, r2 ; \ + movq 24+P0, r3 ; \ + cmovbq 24+P1, r3 ; \ + cmovnbe 24+P2, r3 + +S2N_BN_SYMBOL(sm2_montjadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts throughout the main code. 
+ + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 12 * multiply + 4 * square + 7 * subtract + + montsqr_sm2(z1sq,z_1) + montsqr_sm2(z2sq,z_2) + + montmul_sm2(y1a,z_2,y_1) + montmul_sm2(y2a,z_1,y_2) + + montmul_sm2(x2a,z1sq,x_2) + montmul_sm2(x1a,z2sq,x_1) + montmul_sm2(y2a,z1sq,y2a) + montmul_sm2(y1a,z2sq,y1a) + + sub_sm2(xd,x2a,x1a) + sub_sm2(yd,y2a,y1a) + + montsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x1a) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(xd,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y1a) + + montmul_sm2(resz,xd,z_2) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Load in the z coordinates of the inputs to check for P1 = 0 and P2 = 0 +// The condition codes get set by a comparison (P2 != 0) - (P1 != 0) +// So "NBE" <=> ~(CF \/ ZF) <=> P1 = 0 /\ ~(P2 = 0) +// and "B" <=> CF <=> ~(P1 = 0) /\ P2 = 0 +// and "Z" <=> ZF <=> (P1 = 0 <=> P2 = 0) + + load4(%r8,%r9,%r10,%r11,z_1) + + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + + load4(%r12,%r13,%r14,%r15,z_2) + + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + + cmpq %rax, %rbx + +// Multiplex the outputs accordingly, re-using the z's in registers + + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + + czload4(%r12,%r13,%r14,%r15,resz) + + muxload4(%rax,%rbx,%rcx,%rdx,resx,x_1,x_2) + muxload4(%r8,%r9,%r10,%r11,resy,y_1,y_2) + +// Finally store back the multiplexed values + + store4(x_3,%rax,%rbx,%rcx,%rdx) + store4(y_3,%r8,%r9,%r10,%r11) + store4(z_3,%r12,%r13,%r14,%r15) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble.S new file mode 100644 index 00000000000..33e1cb1a46e --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble.S @@ -0,0 +1,648 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjdouble +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjdouble) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjdouble) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y4 (NUMSIZE*0)(%rsp) + +#define y2 (NUMSIZE*1)(%rsp) + +#define t1 (NUMSIZE*2)(%rsp) + +#define t2 (NUMSIZE*3)(%rsp) +#define x2p (NUMSIZE*3)(%rsp) +#define dx2 (NUMSIZE*3)(%rsp) + +#define xy2 (NUMSIZE*4)(%rsp) + +#define x4p (NUMSIZE*5)(%rsp) +#define d (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2 except for registers + +#define montmul_sm2(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq 
%rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2 except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, 
%rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Corresponds exactly to bignum_add_sm2 + +#define add_sm2(P0,P1,P2) \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + adcq %r11, %r11 ; \ + subq $0xffffffffffffffff, %rax ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r10, %rcx ; \ + sbbq $0xffffffffffffffff, %r8 ; \ + movq $0xfffffffeffffffff, %rdx ; \ + sbbq %rdx, %r9 ; \ + sbbq $0x0, %r11 ; \ + andq %r11, %r10 ; \ + andq %r11, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + subq %r11, %rax ; \ + movq %rax, P0 ; \ + sbbq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// P0 = C * P1 - D * P2 computed as d * (p_sm2 - P2) + c * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_sm2 +// This also applies to the other functions following. 
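(Editorial aside, not part of the imported assembly.) The rearrangement described above keeps the intermediate value non-negative, and the q = h + 1 quotient estimate then needs only one masked correction at the end. A minimal Python sketch of the same arithmetic, assuming plain residues rather than Montgomery encoding and using the C = 12, D = 9 pair that appears below:

# Sketch only: checks that C*P1 - D*P2 mod p_sm2 equals
# C*P1 + D*(p_sm2 - P2) reduced with the q = h + 1 estimate,
# where h is the fifth 64-bit word of the 320-bit intermediate.
import random

P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF

def cmsub_sketch(c, p1, d, p2):
    acc = c * p1 + d * (P_SM2 - p2)    # non-negative rearrangement
    q = (acc >> 256) + 1               # quotient estimate, as in bignum_triple_sm2
    r = acc - q * P_SM2                # lies in [-p_sm2, p_sm2)
    return r + P_SM2 if r < 0 else r   # single masked correction, as in the tail code

for _ in range(1000):
    p1, p2 = random.randrange(P_SM2), random.randrange(P_SM2)
    assert cmsub_sketch(12, p1, 9, p2) == (12 * p1 - 9 * p2) % P_SM2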
+ +#define cmsub_sm2(P0,C,P1,D,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_sm2 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + movq %r8, %r10 ; \ + subq P2, %r8 ; \ + movq $0xffffffff00000000, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xfffffffeffffffff, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = D * (p_sm2 - P2) */ \ + xorl %r12d, %r12d ; \ + movq $D, %rdx ; \ + mulxq %r8, %r8, %rax ; \ + mulxq %r9, %r9, %rcx ; \ + addq %rax, %r9 ; \ + mulxq %r10, %r10, %rax ; \ + adcq %rcx, %r10 ; \ + mulxq %r11, %r11, %rcx ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + C * P1 + D * (p_sm2 - P2) */ \ + movq $C, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 0x10+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %r11 ; \ + adoxq %r12, %rdx ; \ + adcq $1, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 3 * P1 - 8 * P2, computed as (p_sm2 - P2) << 3 + 3 * P1 + +#define cmsub38_sm2(P0,P1,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_sm2 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + movq %r8, %r10 ; \ + subq P2, %r8 ; \ + movq $0xffffffff00000000, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xfffffffeffffffff, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = (p_sm2 - P2) << 3 */ \ + movq %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + shrq $61, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + 3 * P1 + 8 * (p_sm2 - P2) */ \ + movq $3, %rdx ; \ + xorl %eax, %eax ; \ + mulxq P1, %rax, %rcx ; \ + adcxq %rax, %r8 ; \ + adoxq %rcx, %r9 ; \ + mulxq 0x8+P1, %rax, %rcx ; \ + adcxq %rax, %r9 ; \ + adoxq %rcx, %r10 ; \ + mulxq 0x10+P1, %rax, %rcx ; \ + adcxq %rax, %r10 ; \ + adoxq %rcx, %r11 ; \ + mulxq 0x18+P1, %rax, %rdx ; \ + adcxq %rax, %r11 ; \ + adoxq %r12, %rdx ; \ + adcq $1, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 4 * P1 - P2, by direct subtraction of P2, +// since the quotient estimate still works safely +// for initial value > -p_sm2 + +#define cmsub41_sm2(P0,P1,P2) \ + movq 0x18+P1, %r11 ; \ + movq %r11, %rdx ; \ + movq 0x10+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 0x8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + shrq $62, %rdx ; \ + addq $1, 
%rdx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq $0, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(sm2_montjdouble): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations +// z2 = z^2 +// y2 = y^2 + + montsqr_sm2(z2,z_1) + montsqr_sm2(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_sm2(t2,x_1,z2) + weakadd_sm2(t1,x_1,z2) + montmul_sm2(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_sm2(t1,y_1,z_1) + montmul_sm2(xy2,x_1,y2) + montsqr_sm2(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_sm2(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_sm2(d,12,xy2,9,x4p) + sub_sm2(t1,t1,z2) + +// y4 = y^4 + + montsqr_sm2(y4,y2) + +// dx2 = d * x2p + + montmul_sm2(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_sm2(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_sm2(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_sm2(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble_alt.S new file mode 100644 index 00000000000..d7d33851c73 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjdouble_alt.S @@ -0,0 +1,727 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point doubling on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjdouble_alt +// (uint64_t p3[static 12],uint64_t p1[static 12]); +// +// Does p3 := 2 * p1 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// +// Standard x86-64 ABI: RDI = p3, RSI = p1 +// Microsoft x64 ABI: RCX = p3, RDX = p1 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjdouble_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjdouble_alt) + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1, which is true when the +// arguments come in initially and is not disturbed throughout. 
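(Editorial aside, not part of the imported assembly.) This _alt file runs the same sequence of field operations as sm2_montjdouble above, only with mulq-based rather than mulx-based macros. The per-operation comments in the main code (z2 = z^2 through y' = 3 * dx2 - 8 * y4) match the standard Jacobian doubling formulas for curves with a = -3 (which SM2 uses): M = 3*(x^2 - z^4), S = 4*x*y^2, x' = M^2 - 2*S, y' = M*(S - x') - 8*y^4, z' = 2*y*z. A rough Python check of that correspondence, assuming plain residues mod p_sm2 rather than Montgomery encoding:

import random

P_SM2 = 0xFFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF

def double_seq(x, y, z):
    # Mirrors the order of macro calls in the main code.
    z2 = z * z % P_SM2                     # montsqr_sm2(z2, z_1)
    y2 = y * y % P_SM2                     # montsqr_sm2(y2, y_1)
    x2p = (x - z2) * (x + z2) % P_SM2      # sub/weakadd + montmul: x^2 - z^4
    t1 = (y + z) % P_SM2                   # add_sm2(t1, y_1, z_1)
    xy2 = x * y2 % P_SM2                   # montmul_sm2(xy2, x_1, y2)
    x4p = x2p * x2p % P_SM2                # montsqr_sm2(x4p, x2p)
    t1 = t1 * t1 % P_SM2                   # montsqr_sm2(t1, t1) = (y + z)^2
    d = (12 * xy2 - 9 * x4p) % P_SM2       # cmsub_sm2(d, 12, xy2, 9, x4p)
    t1 = (t1 - z2) % P_SM2                 # sub_sm2(t1, t1, z2) = y^2 + 2yz
    y4 = y2 * y2 % P_SM2                   # montsqr_sm2(y4, y2)
    dx2 = d * x2p % P_SM2                  # montmul_sm2(dx2, d, x2p)
    z3 = (t1 - y2) % P_SM2                 # sub_sm2(z_3, t1, y2) = 2yz
    x3 = (4 * xy2 - d) % P_SM2             # cmsub41_sm2(x_3, xy2, d)
    y3 = (3 * dx2 - 8 * y4) % P_SM2        # cmsub38_sm2(y_3, dx2, y4)
    return x3, y3, z3

def double_ref(x, y, z):
    # Textbook Jacobian doubling for a = -3 (a pure polynomial identity,
    # so curve membership is not needed for the comparison).
    m = 3 * (x * x - pow(z, 4, P_SM2)) % P_SM2
    s = 4 * x * y * y % P_SM2
    x3 = (m * m - 2 * s) % P_SM2
    y3 = (m * (s - x3) - 8 * pow(y, 4, P_SM2)) % P_SM2
    return x3, y3, 2 * y * z % P_SM2

for _ in range(100):
    x, y, z = (random.randrange(P_SM2) for _ in range(3))
    assert double_seq(x, y, z) == double_ref(x, y, z)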
+ +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define z2 (NUMSIZE*0)(%rsp) +#define y4 (NUMSIZE*0)(%rsp) + +#define y2 (NUMSIZE*1)(%rsp) + +#define t1 (NUMSIZE*2)(%rsp) + +#define t2 (NUMSIZE*3)(%rsp) +#define x2p (NUMSIZE*3)(%rsp) +#define dx2 (NUMSIZE*3)(%rsp) + +#define xy2 (NUMSIZE*4)(%rsp) + +#define x4p (NUMSIZE*5)(%rsp) +#define d (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax 
; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2_alt except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// 
Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Corresponds exactly to bignum_add_sm2 + +#define add_sm2(P0,P1,P2) \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + adcq %r11, %r11 ; \ + subq $0xffffffffffffffff, %rax ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r10, %rcx ; \ + sbbq $0xffffffffffffffff, %r8 ; \ + movq $0xfffffffeffffffff, %rdx ; \ + sbbq %rdx, %r9 ; \ + sbbq $0x0, %r11 ; \ + andq %r11, %r10 ; \ + andq %r11, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// A weak version of add that only guarantees sum in 4 digits + +#define weakadd_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + addq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + adcq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + adcq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + adcq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + subq %r11, %rax ; \ + movq %rax, P0 ; \ + sbbq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + sbbq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + sbbq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// P0 = C * P1 - D * P2 computed as d * (p_sm2 - P2) + c * P1 +// Quotient estimation is done just as q = h + 1 as in bignum_triple_sm2 +// This also applies to the other functions following. 
+ +#define cmsub_sm2(P0,C,P1,D,P2) \ + /* First (%r12;%r11;%r10;%r9) = p_sm2 - P2 */ \ + movq $0xffffffffffffffff, %r9 ; \ + movq %r9, %r11 ; \ + subq P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq 0x8+P2, %r10 ; \ + sbbq 0x10+P2, %r11 ; \ + movq $0xfffffffeffffffff, %r12 ; \ + sbbq 0x18+P2, %r12 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = D * (p_sm2 - P2) */ \ + movq $D, %rcx ; \ + movq %r9, %rax ; \ + mulq %rcx; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + movq %r10, %rax ; \ + xorl %r10d, %r10d ; \ + mulq %rcx; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq %r11, %rax ; \ + xorl %r11d, %r11d ; \ + mulq %rcx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + movq %r12, %rax ; \ + xorl %r12d, %r12d ; \ + mulq %rcx; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + C * P1 + D * (p_sm2 - P2) */ \ + movl $C, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + leaq 1(%r12), %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 3 * P1 - 8 * P2, computed as (p_sm2 - P2) << 3 + 3 * P1 + +#define cmsub38_sm2(P0,P1,P2) \ + /* First (%r11;%r10;%r9;%r8) = p_sm2 - P2 */ \ + movq $0xffffffffffffffff, %r8 ; \ + movq %r8, %r10 ; \ + subq P2, %r8 ; \ + movq $0xffffffff00000000, %r9 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + movq $0xfffffffeffffffff, %r11 ; \ + sbbq 0x18+P2, %r11 ; \ + /* (%r12;%r11;%r10;%r9;%r8) = (p_sm2 - P2) << 3 */ \ + movq %r11, %r12 ; \ + shldq $3, %r10, %r11 ; \ + shldq $3, %r9, %r10 ; \ + shldq $3, %r8, %r9 ; \ + shlq $3, %r8 ; \ + shrq $61, %r12 ; \ + /* (%rdx;%r11;%r10;%r9;%r8) = 2^256 + 3 * P1 + 8 * (p_sm2 - P2) */ \ + movl $3, %ecx ; \ + movq P1, %rax ; \ + mulq %rcx; \ + addq %rax, %r8 ; \ + adcq %rdx, %r9 ; \ + sbbq %rbx, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + sbbq %rbx, %rbx ; \ + movq 0x10+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rbx, %rbx ; \ + movq 0x18+P1, %rax ; \ + mulq %rcx; \ + subq %rbx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + leaq 1(%r12), %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq 
%rcx, %r11 ; \ + movq %r11, 0x18+P0 + +// P0 = 4 * P1 - P2, by direct subtraction of P2, +// since the quotient estimate still works safely +// for initial value > -p_sm2 + +#define cmsub41_sm2(P0,P1,P2) \ + movq 0x18+P1, %r11 ; \ + movq %r11, %rdx ; \ + movq 0x10+P1, %r10 ; \ + shldq $2, %r10, %r11 ; \ + movq 0x8+P1, %r9 ; \ + shldq $2, %r9, %r10 ; \ + movq P1, %r8 ; \ + shldq $2, %r8, %r9 ; \ + shlq $2, %r8 ; \ + shrq $62, %rdx ; \ + addq $1, %rdx ; \ + subq P2, %r8 ; \ + sbbq 0x8+P2, %r9 ; \ + sbbq 0x10+P2, %r10 ; \ + sbbq 0x18+P2, %r11 ; \ + sbbq $0, %rdx ; \ + /* Now the tail for modular reduction from tripling */ \ + movq %rdx, %rax ; \ + shlq $0x20, %rax ; \ + movq %rax, %rcx ; \ + subq %rdx, %rax ; \ + addq %rdx, %r8 ; \ + adcq %rax, %r9 ; \ + adcq $0x0, %r10 ; \ + adcq %rcx, %r11 ; \ + sbbq %rdx, %rdx ; \ + notq %rdx; \ + movq $0xffffffff00000000, %rax ; \ + andq %rdx, %rax ; \ + movq %rdx, %rcx ; \ + btr $0x20, %rcx ; \ + addq %rdx, %r8 ; \ + movq %r8, P0 ; \ + adcq %rax, %r9 ; \ + movq %r9, 0x8+P0 ; \ + adcq %rdx, %r10 ; \ + movq %r10, 0x10+P0 ; \ + adcq %rcx, %r11 ; \ + movq %r11, 0x18+P0 + +S2N_BN_SYMBOL(sm2_montjdouble_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room on stack for temporary variables + + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Main code, just a sequence of basic field operations + +// z2 = z^2 +// y2 = y^2 + + montsqr_sm2(z2,z_1) + montsqr_sm2(y2,y_1) + +// x2p = x^2 - z^4 = (x + z^2) * (x - z^2) + + sub_sm2(t2,x_1,z2) + weakadd_sm2(t1,x_1,z2) + montmul_sm2(x2p,t1,t2) + +// t1 = y + z +// xy2 = x * y^2 +// x4p = x2p^2 + + add_sm2(t1,y_1,z_1) + montmul_sm2(xy2,x_1,y2) + montsqr_sm2(x4p,x2p) + +// t1 = (y + z)^2 + + montsqr_sm2(t1,t1) + +// d = 12 * xy2 - 9 * x4p +// t1 = y^2 + 2 * y * z + + cmsub_sm2(d,12,xy2,9,x4p) + sub_sm2(t1,t1,z2) + +// y4 = y^4 + + montsqr_sm2(y4,y2) + +// dx2 = d * x2p + + montmul_sm2(dx2,d,x2p) + +// z_3' = 2 * y * z + + sub_sm2(z_3,t1,y2) + +// x' = 4 * xy2 - d + + cmsub41_sm2(x_3,xy2,d) + +// y' = 3 * dx2 - 8 * y4 + + cmsub38_sm2(y_3,dx2,y4) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd.S new file mode 100644 index 00000000000..48de1f997d1 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd.S @@ -0,0 +1,594 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjmixadd +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). +// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. 
+// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjmixadd) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjmixadd) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// By design, none of the code macros modify any of +// these, so we maintain the assignments throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2 except for registers + +#define montmul_sm2(P0,P1,P2) \ + xorl %ecx, %ecx ; \ + movq P2, %rdx ; \ + mulxq P1, %r8, %r9 ; \ + mulxq 0x8+P1, %rax, %r10 ; \ + addq %rax, %r9 ; \ + mulxq 0x10+P1, %rax, %r11 ; \ + adcq %rax, %r10 ; \ + mulxq 0x18+P1, %rax, %r12 ; \ + adcq %rax, %r11 ; \ + adcq %rcx, %r12 ; \ + xorl %ecx, %ecx ; \ + movq 0x8+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r9 ; \ + adoxq %rbx, %r10 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x18+P1, %rax, %r13 ; \ + adcxq %rax, %r12 ; \ + adoxq %rcx, %r13 ; \ + adcxq %rcx, %r13 ; \ + xorl %ecx, %ecx ; \ + movq 0x10+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x18+P1, %rax, %r14 ; \ + adcxq %rax, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcxq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + movq 0x18+P2, %rdx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + mulxq 0x10+P1, %rax, %rbx ; \ + adcxq %rax, %r13 ; \ + adoxq %rbx, %r14 ; \ + mulxq 0x18+P1, %rax, %r15 ; \ + adcxq %rax, %r14 ; \ + adoxq %rcx, %r15 ; \ + adcxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + 
movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2 except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; 
\ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Almost-Montgomery variant which we use when an input to other muls +// with the other argument fully reduced (which is always safe). + +#define amontsqr_sm2(P0,P1) \ + movq P1, %rdx ; \ + mulxq %rdx, %r8, %r15 ; \ + mulxq 0x8+P1, %r9, %r10 ; \ + mulxq 0x18+P1, %r11, %r12 ; \ + movq 0x10+P1, %rdx ; \ + mulxq 0x18+P1, %r13, %r14 ; \ + xorl %ecx, %ecx ; \ + mulxq P1, %rax, %rbx ; \ + adcxq %rax, %r10 ; \ + adoxq %rbx, %r11 ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r11 ; \ + adoxq %rbx, %r12 ; \ + movq 0x18+P1, %rdx ; \ + mulxq 0x8+P1, %rax, %rbx ; \ + adcxq %rax, %r12 ; \ + adoxq %rbx, %r13 ; \ + adcxq %rcx, %r13 ; \ + adoxq %rcx, %r14 ; \ + adcq %rcx, %r14 ; \ + xorl %ecx, %ecx ; \ + adcxq %r9, %r9 ; \ + adoxq %r15, %r9 ; \ + movq 0x8+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r10, %r10 ; \ + adoxq %rax, %r10 ; \ + adcxq %r11, %r11 ; \ + adoxq %rdx, %r11 ; \ + movq 0x10+P1, %rdx ; \ + mulxq %rdx, %rax, %rdx ; \ + adcxq %r12, %r12 ; \ + adoxq %rax, %r12 ; \ + adcxq %r13, %r13 ; \ + adoxq %rdx, %r13 ; \ + movq 0x18+P1, %rdx ; \ + mulxq %rdx, %rax, %r15 ; \ + adcxq %r14, %r14 ; \ + adoxq %rax, %r14 ; \ + adcxq %rcx, %r15 ; \ + adoxq %rcx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + sbbq %rax, %rax ; \ + movq $0xffffffff00000000, %rbx ; \ + movq %rax, %rcx ; \ + andq %rax, %rbx ; \ + btr $32, %rcx ; \ + subq %rax, %r12 ; \ + sbbq %rbx, %r13 ; \ + sbbq %rax, %r14 ; \ + sbbq %rcx, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; 
\ + movq PEQ, %rax ; \ + cmovzq %rax, r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(sm2_montjmixadd): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts throughout the main code. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + amontsqr_sm2(zp2,z_1) + + montmul_sm2(y2a,z_1,y_2) + montmul_sm2(x2a,zp2,x_2) + montmul_sm2(y2a,zp2,y2a) + + sub_sm2(xd,x2a,x_1) + + sub_sm2(yd,y2a,y_1) + + amontsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x_1) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(resz,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y_1) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. +// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_sm2), +// hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movl $0x00000000ffffffff, %eax + cmovzq %rax, %r9 + movl $0, %eax + cmovzq %rax, %r10 + movq $0x0000000100000000, %rax + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd_alt.S new file mode 100644 index 00000000000..74e5c7d45c2 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjmixadd_alt.S @@ -0,0 +1,533 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Point mixed addition on GM/T 0003-2012 curve SM2 in Montgomery-Jacobian coordinates +// +// extern void sm2_montjmixadd_alt +// (uint64_t p3[static 12],uint64_t p1[static 12],uint64_t p2[static 8]); +// +// Does p3 := p1 + p2 where all points are regarded as Jacobian triples with +// each coordinate in the Montgomery domain, i.e. x' = (2^256 * x) mod p_sm2. +// A Jacobian triple (x',y',z') represents affine point (x/z^2,y/z^3). 
+// The "mixed" part means that p2 only has x and y coordinates, with the +// implicit z coordinate assumed to be the identity. +// +// Standard x86-64 ABI: RDI = p3, RSI = p1, RDX = p2 +// Microsoft x64 ABI: RCX = p3, RDX = p1, R8 = p2 +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjmixadd_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjmixadd_alt) + .text + +// Size of individual field elements + +#define NUMSIZE 32 + +// Pointer-offset pairs for inputs and outputs +// These assume %rdi = p3, %rsi = p1 and %rbp = p2, +// which needs to be set up explicitly before use. +// By design, none of the code macros modify any of +// these, so we maintain the assignments throughout. + +#define x_1 0(%rsi) +#define y_1 NUMSIZE(%rsi) +#define z_1 (2*NUMSIZE)(%rsi) + +#define x_2 0(%rbp) +#define y_2 NUMSIZE(%rbp) + +#define x_3 0(%rdi) +#define y_3 NUMSIZE(%rdi) +#define z_3 (2*NUMSIZE)(%rdi) + +// Pointer-offset pairs for temporaries, with some aliasing +// NSPACE is the total stack needed for these temporaries + +#define zp2 (NUMSIZE*0)(%rsp) +#define ww (NUMSIZE*0)(%rsp) +#define resx (NUMSIZE*0)(%rsp) + +#define yd (NUMSIZE*1)(%rsp) +#define y2a (NUMSIZE*1)(%rsp) + +#define x2a (NUMSIZE*2)(%rsp) +#define zzx2 (NUMSIZE*2)(%rsp) + +#define zz (NUMSIZE*3)(%rsp) +#define t1 (NUMSIZE*3)(%rsp) + +#define t2 (NUMSIZE*4)(%rsp) +#define zzx1 (NUMSIZE*4)(%rsp) +#define resy (NUMSIZE*4)(%rsp) + +#define xd (NUMSIZE*5)(%rsp) +#define resz (NUMSIZE*5)(%rsp) + +#define NSPACE (NUMSIZE*6) + +// Corresponds to bignum_montmul_sm2_alt except for registers + +#define montmul_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + mulq P2; \ + movq %rax, %r8 ; \ + movq %rdx, %r9 ; \ + xorq %r10, %r10 ; \ + xorq %r11, %r11 ; \ + movq P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + movq 0x8+P1, %rax ; \ + mulq P2; \ + addq %rax, %r9 ; \ + adcq %rdx, %r10 ; \ + adcq %r11, %r11 ; \ + xorq %r12, %r12 ; \ + movq P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq %r12, %r12 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + movq 0x10+P1, %rax ; \ + mulq P2; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + adcq $0x0, %r12 ; \ + xorq %r13, %r13 ; \ + movq P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq %r13, %r13 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + movq 0x18+P1, %rax ; \ + mulq P2; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + adcq $0x0, %r13 ; \ + xorq %r14, %r14 ; \ + movq 0x8+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq %r14, %r14 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x8+P2; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x10+P2; \ + addq %rax, %r13 ; \ + adcq %rdx, %r14 ; \ + adcq $0x0, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq 0x18+P2; \ + addq %rax, %r14 ; \ + adcq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + 
movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds to bignum_montsqr_sm2_alt except for registers + +#define montsqr_sm2(P0,P1) \ + movq P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %rax; \ + movq %rax, %r8 ; \ + movq %rdx, %r15 ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + movq %rax, %r9 ; \ + movq %rdx, %r10 ; \ + movq 0x18+P1, %rax ; \ + movq %rax, %r13 ; \ + mulq %rbx; \ + movq %rax, %r11 ; \ + movq %rdx, %r12 ; \ + movq 0x10+P1, %rax ; \ + movq %rax, %rbx ; \ + mulq %r13; \ + movq %rax, %r13 ; \ + movq %rdx, %r14 ; \ + movq P1, %rax ; \ + mulq %rbx; \ + addq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ + sbbq %rcx, %rcx ; \ + movq 0x18+P1, %rbx ; \ + movq 0x8+P1, %rax ; \ + mulq %rbx; \ + subq %rcx, %rdx ; \ + addq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + adcq $0x0, %r14 ; \ + xorl %ecx, %ecx ; \ + addq %r9, %r9 ; \ + adcq %r10, %r10 ; \ + adcq %r11, %r11 ; \ + adcq %r12, %r12 ; \ + adcq %r13, %r13 ; \ + adcq %r14, %r14 ; \ + adcq %rcx, %rcx ; \ + movq 0x8+P1, %rax ; \ + mulq %rax; \ + addq %r15, %r9 ; \ + adcq %rax, %r10 ; \ + adcq %rdx, %r11 ; \ + sbbq %r15, %r15 ; \ + movq 0x10+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r12 ; \ + adcq %rdx, %r13 ; \ + sbbq %r15, %r15 ; \ + movq 0x18+P1, %rax ; \ + mulq %rax; \ + negq %r15; \ + adcq %rax, %r14 ; \ + adcq %rcx, %rdx ; \ + movq %rdx, %r15 ; \ + movq %r8, %rax ; \ + shlq $0x20, %rax ; \ + movq %r8, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r8, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r9 ; \ + sbbq %rcx, %r10 ; \ + sbbq %rdx, %r11 ; \ + sbbq %rbx, %r8 ; \ + movq %r9, %rax ; \ + shlq $0x20, %rax ; \ + movq %r9, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r9, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r10 ; \ + sbbq %rcx, %r11 ; \ + sbbq %rdx, %r8 ; \ + sbbq %rbx, %r9 ; \ + movq %r10, %rax ; \ + shlq $0x20, %rax ; \ + movq %r10, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx 
; \ + movq %rcx, %rbx ; \ + subq %r10, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r11 ; \ + sbbq %rcx, %r8 ; \ + sbbq %rdx, %r9 ; \ + sbbq %rbx, %r10 ; \ + movq %r11, %rax ; \ + shlq $0x20, %rax ; \ + movq %r11, %rcx ; \ + shrq $0x20, %rcx ; \ + movq %rax, %rdx ; \ + movq %rcx, %rbx ; \ + subq %r11, %rax ; \ + sbbq $0x0, %rcx ; \ + subq %rax, %r8 ; \ + sbbq %rcx, %r9 ; \ + sbbq %rdx, %r10 ; \ + sbbq %rbx, %r11 ; \ + xorl %eax, %eax ; \ + addq %r8, %r12 ; \ + adcq %r9, %r13 ; \ + adcq %r10, %r14 ; \ + adcq %r11, %r15 ; \ + adcq %rax, %rax ; \ + movl $0x1, %ecx ; \ + movl $0xffffffff, %edx ; \ + xorl %ebx, %ebx ; \ + addq %r12, %rcx ; \ + leaq 0x1(%rdx), %r11 ; \ + adcq %r13, %rdx ; \ + leaq -0x1(%rbx), %r8 ; \ + adcq %r14, %rbx ; \ + adcq %r15, %r11 ; \ + adcq %rax, %r8 ; \ + cmovbq %rcx, %r12 ; \ + cmovbq %rdx, %r13 ; \ + cmovbq %rbx, %r14 ; \ + cmovbq %r11, %r15 ; \ + movq %r12, P0 ; \ + movq %r13, 0x8+P0 ; \ + movq %r14, 0x10+P0 ; \ + movq %r15, 0x18+P0 + +// Corresponds exactly to bignum_sub_sm2 + +#define sub_sm2(P0,P1,P2) \ + movq P1, %rax ; \ + subq P2, %rax ; \ + movq 0x8+P1, %rcx ; \ + sbbq 0x8+P2, %rcx ; \ + movq 0x10+P1, %r8 ; \ + sbbq 0x10+P2, %r8 ; \ + movq 0x18+P1, %r9 ; \ + sbbq 0x18+P2, %r9 ; \ + movq $0xffffffff00000000, %r10 ; \ + sbbq %r11, %r11 ; \ + andq %r11, %r10 ; \ + movq %r11, %rdx ; \ + btr $0x20, %rdx ; \ + addq %r11, %rax ; \ + movq %rax, P0 ; \ + adcq %r10, %rcx ; \ + movq %rcx, 0x8+P0 ; \ + adcq %r11, %r8 ; \ + movq %r8, 0x10+P0 ; \ + adcq %rdx, %r9 ; \ + movq %r9, 0x18+P0 + +// Additional macros to help with final multiplexing + +#define testzero4(P) \ + movq P, %rax ; \ + movq 8+P, %rdx ; \ + orq 16+P, %rax ; \ + orq 24+P, %rdx ; \ + orq %rdx, %rax + +#define mux4(r0,r1,r2,r3,PNE,PEQ) \ + movq PNE, r0 ; \ + movq PEQ, %rax ; \ + cmovzq %rax, r0 ; \ + movq 8+PNE, r1 ; \ + movq 8+PEQ, %rax ; \ + cmovzq %rax, r1 ; \ + movq 16+PNE, r2 ; \ + movq 16+PEQ, %rax ; \ + cmovzq %rax, r2 ; \ + movq 24+PNE, r3 ; \ + movq 24+PEQ, %rax ; \ + cmovzq %rax, r3 + +#define load4(r0,r1,r2,r3,P) \ + movq P, r0 ; \ + movq 8+P, r1 ; \ + movq 16+P, r2 ; \ + movq 24+P, r3 + +#define store4(P,r0,r1,r2,r3) \ + movq r0, P ; \ + movq r1, 8+P ; \ + movq r2, 16+P ; \ + movq r3, 24+P + +S2N_BN_SYMBOL(sm2_montjmixadd_alt): + _CET_ENDBR + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + +// Save registers and make room on stack for temporary variables +// Put the input y in %rbp where it lasts throughout the main code. + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + + movq %rdx, %rbp + +// Main code, just a sequence of basic field operations +// 8 * multiply + 3 * square + 7 * subtract + + montsqr_sm2(zp2,z_1) + + montmul_sm2(y2a,z_1,y_2) + montmul_sm2(x2a,zp2,x_2) + montmul_sm2(y2a,zp2,y2a) + + sub_sm2(xd,x2a,x_1) + + sub_sm2(yd,y2a,y_1) + + montsqr_sm2(zz,xd) + montsqr_sm2(ww,yd) + + montmul_sm2(zzx1,zz,x_1) + montmul_sm2(zzx2,zz,x2a) + + sub_sm2(resx,ww,zzx1) + sub_sm2(t1,zzx2,zzx1) + + montmul_sm2(resz,xd,z_1) + + sub_sm2(resx,resx,zzx2) + + sub_sm2(t2,zzx1,resx) + + montmul_sm2(t1,t1,y_1) + montmul_sm2(t2,yd,t2) + + sub_sm2(resy,t2,t1) + +// Test if z_1 = 0 to decide if p1 = 0 (up to projective equivalence) + + testzero4(z_1) + +// Multiplex: if p1 <> 0 just copy the computed result from the staging area. 
+// If p1 = 0 then return the point p2 augmented with a z = 1 coordinate (in +// Montgomery form so not the simple constant 1 but rather 2^256 - p_sm2), +// hence giving 0 + p2 = p2 for the final result. + + mux4(%r8,%r9,%r10,%r11,resx,x_2) + mux4(%r12,%r13,%r14,%r15,resy,y_2) + + store4(x_3,%r8,%r9,%r10,%r11) + store4(y_3,%r12,%r13,%r14,%r15) + + load4(%r8,%r9,%r10,%r11,resz) + movl $1, %eax + cmovzq %rax, %r8 + movl $0x00000000ffffffff, %eax + cmovzq %rax, %r9 + movl $0, %eax + cmovzq %rax, %r10 + movq $0x0000000100000000, %rax + cmovzq %rax, %r11 + + store4(z_3,%r8,%r9,%r10,%r11) + +// Restore stack and registers + + addq $NSPACE, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul.S new file mode 100644 index 00000000000..de2d11d8d86 --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul.S @@ -0,0 +1,3859 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for GM/T 0003-2012 curve SM2 +// Input scalar[4], point[12]; output res[12] +// +// extern void sm2_montjscalarmul +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version sm2_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_sm2 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the GM/T 0003-2012 elliptic +// curve SM2, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of the group order n_sm2) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. Uppercase syntactic variants +// make x86_att version simpler to generate. + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define res (31*NUMSIZE)(%rsp) + +#define NSPACE (32*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc.
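(Editorial aside, not part of the imported assembly.) The selectblock(I) macro defined next, invoked once per table entry, gives a constant-time table lookup: every entry is read and cmovzq retains only the one whose index matches, so the memory access pattern is independent of the secret digit. A rough Python equivalent, with hypothetical names:

def ct_select(table, idx):
    # table holds entries 1*P .. 8*P as 12-word Jacobian triples; idx == 0
    # leaves the accumulator at zero, which encodes the point at infinity.
    acc = [0] * 12
    for i, entry in enumerate(table, start=1):
        keep = int(i == idx)                  # cmpq $I, %rdi sets ZF; cmovzq copies
        acc = [keep * e + (1 - keep) * a for e, a in zip(entry, acc)]
    return acc

For idx in 1..8 this returns the matching entry; for idx 0 it returns twelve zero words.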
+ +#define selectblock(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+96*(I-1)(%rsp), %rax ; \ + cmovzq TAB+96*(I-1)+8(%rsp), %rbx ; \ + cmovzq TAB+96*(I-1)+16(%rsp), %rcx ; \ + cmovzq TAB+96*(I-1)+24(%rsp), %rdx ; \ + cmovzq TAB+96*(I-1)+32(%rsp), %r8 ; \ + cmovzq TAB+96*(I-1)+40(%rsp), %r9 ; \ + cmovzq TAB+96*(I-1)+48(%rsp), %r10 ; \ + cmovzq TAB+96*(I-1)+56(%rsp), %r11 ; \ + cmovzq TAB+96*(I-1)+64(%rsp), %r12 ; \ + cmovzq TAB+96*(I-1)+72(%rsp), %r13 ; \ + cmovzq TAB+96*(I-1)+80(%rsp), %r14 ; \ + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + +S2N_BN_SYMBOL(sm2_montjscalarmul): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq sm2_montjscalarmul_standard + popq %rsi + popq %rdi + ret + +sm2_montjscalarmul_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + movq %rdx, %rbx + movq %rdi, res + +// Load the digits of group order n_sm2 = [%r15;%r14;%r13;%r12] + + movq $0x53bbf40939d54123, %r12 + movq $0x7203df6b21c6052b, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xfffffffeffffffff, %r15 + +// First, reduce the input scalar mod n_sm2, i.e. conditionally subtract n_sm2 + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + +// Now if the top bit of the reduced scalar is set, negate it mod n_sm2, +// i.e. do n |-> n_sm2 - n. Remember the sign in %rbp so we can +// correspondingly negate the point below. + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %r11, %rbp + shrq $63, %rbp + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + movq $0x8888888888888888, %rax + addq %rax, %r8 + adcq %rax, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + btc $63, %r11 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_sm2 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. 
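(Editorial aside, not part of the imported assembly.) With the recoding above, each 4-bit window of the adjusted scalar encodes a signed digit d = v - 8 in [-8, 7]. The main loop below recovers |d| and its sign without branching (subq $8 / sbbq / xorq / subq), selects |d| * P from the table, and negates the y coordinate when d is negative. A minimal sketch of that decode, assuming 64-bit two's-complement words:

MASK64 = (1 << 64) - 1

def decode_window(v):
    d = (v - 8) & MASK64                  # subq $8, %rdi (two's complement)
    sign = MASK64 if v < 8 else 0         # sbbq %rsi, %rsi: all-ones iff a borrow occurred
    absd = ((d ^ sign) - sign) & MASK64   # xorq + subq: branchless absolute value
    return absd, sign != 0                # |v - 8| and the "negate the point" flag

for v in range(16):
    absd, neg = decode_window(v)
    assert absd == abs(v - 8) and neg == (v - 8 < 0)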
+ + movq (%rbx), %rax + movq %rax, TAB(%rsp) + movq 8(%rbx), %rax + movq %rax, TAB+8(%rsp) + movq 16(%rbx), %rax + movq %rax, TAB+16(%rsp) + movq 24(%rbx), %rax + movq %rax, TAB+24(%rsp) + + movq 32(%rbx), %r12 + movq %r12, %rax + movq 40(%rbx), %r13 + orq %r13, %rax + movq 48(%rbx), %r14 + movq %r14, %rcx + movq 56(%rbx), %r15 + orq %r15, %rcx + orq %rcx, %rax + cmovzq %rax, %rbp + + xorl %r11d, %r11d + movl $0x00000000ffffffff, %r9d + notq %r11 + movq %r11, %r8 + movq %r11, %r10 + xorq %r8, %r9 + btr $32, %r11 + + subq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + sbbq %r15, %r11 + testq %rbp, %rbp + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + movq %r8, TAB+32(%rsp) + movq %r9, TAB+40(%rsp) + movq %r10, TAB+48(%rsp) + movq %r11, TAB+56(%rsp) + + movq 64(%rbx), %rax + movq %rax, TAB+64(%rsp) + movq 72(%rbx), %rax + movq %rax, TAB+72(%rsp) + movq 80(%rbx), %rax + movq %rax, TAB+80(%rsp) + movq 88(%rbx), %rax + movq %rax, TAB+88(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + leaq TAB+96*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq TAB+96*2(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_sm2_montjadd + + leaq TAB+96*3(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq TAB+96*4(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_sm2_montjadd + + leaq TAB+96*5(%rsp), %rdi + leaq TAB+96*2(%rsp), %rsi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq TAB+96*6(%rsp), %rdi + leaq TAB+96*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_sm2_montjadd + + leaq TAB+96*7(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + callq sm2_montjscalarmul_sm2_montjdouble + +// Set up accumulator as table entry for top 4 bits (constant-time indexing) + + movq SCALARB+24(%rsp), %rdi + shrq $60, %rdi + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + + movq %rax, ACC(%rsp) + movq %rbx, ACC+8(%rsp) + movq %rcx, ACC+16(%rsp) + movq %rdx, ACC+24(%rsp) + movq %r8, ACC+32(%rsp) + movq %r9, ACC+40(%rsp) + movq %r10, ACC+48(%rsp) + movq %r11, ACC+56(%rsp) + movq %r12, ACC+64(%rsp) + movq %r13, ACC+72(%rsp) + movq %r14, ACC+80(%rsp) + movq %r15, ACC+88(%rsp) + +// Main loop over size-4 bitfield + + movl $252, %ebp + +sm2_montjscalarmul_mainloop: + subq $4, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_sm2_montjdouble + + movq %rbp, %rax + shrq $6, %rax + movq (%rsp,%rax,8), %rdi + movq %rbp, %rcx + shrq %cl, %rdi + andq $15, %rdi + + subq $8, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + 
xorl %r15d, %r15d + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_sm2 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq %r12, TABENT+64(%rsp) + movq %r13, TABENT+72(%rsp) + movq %r14, TABENT+80(%rsp) + movq %r15, TABENT+88(%rsp) + + xorl %r15d, %r15d + movq %r8, %rax + movl $0x00000000ffffffff, %r13d + orq %r9, %rax + notq %r15 + movq %r10, %rcx + movq %r15, %r12 + orq %r11, %rcx + movq %r15, %r14 + xorq %r12, %r13 + btr $32, %r15 + orq %rcx, %rax + cmovzq %rax, %rsi + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + testq %rsi, %rsi + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_sm2_montjadd + + testq %rbp, %rbp + jne sm2_montjscalarmul_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + movq res, %rdi + movq ACC(%rsp), %rax + movq %rax, (%rdi) + movq ACC+8(%rsp), %rax + movq %rax, 8(%rdi) + movq ACC+16(%rsp), %rax + movq %rax, 16(%rdi) + movq ACC+24(%rsp), %rax + movq %rax, 24(%rdi) + + movq ACC+32(%rsp), %rax + movq %rax, 32(%rdi) + movq ACC+40(%rsp), %rax + movq %rax, 40(%rdi) + movq ACC+48(%rsp), %rax + movq %rax, 48(%rdi) + movq ACC+56(%rsp), %rax + movq %rax, 56(%rdi) + + movq ACC+64(%rsp), %rax + movq %rax, 64(%rdi) + movq ACC+72(%rsp), %rax + movq %rax, 72(%rdi) + movq ACC+80(%rsp), %rax + movq %rax, 80(%rdi) + movq ACC+88(%rsp), %rax + movq %rax, 88(%rdi) + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +sm2_montjscalarmul_sm2_montjadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xe0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq 
%rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq %rax, %rax + movabsq $0xffffffff00000000, %rbx + movq %rax, %rcx + andq %rax, %rbx + btr $0x20, %rcx + subq %rax, %r12 + sbbq %rbx, %r13 + sbbq %rax, %r14 + sbbq %rcx, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x40(%rbp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rbp), %r9, %r10 + mulxq 0x58(%rbp), %r11, %r12 + movq 0x50(%rbp), %rdx + mulxq 0x58(%rbp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rbp), %rdx + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rbp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rbp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rbp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq %rax, %rax + movabsq $0xffffffff00000000, %rbx + movq %rax, %rcx + andq %rax, %rbx + btr $0x20, %rcx + subq %rax, %r12 + sbbq %rbx, %r13 + sbbq %rax, %r14 + sbbq %rcx, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + xorl %ecx, %ecx + movq 0x20(%rsi), %rdx + mulxq 0x40(%rbp), %r8, %r9 + mulxq 0x48(%rbp), %rax, %r10 + addq %rax, %r9 + mulxq 0x50(%rbp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x58(%rbp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x28(%rsi), %rdx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rbp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq 
%rcx, %r13 + xorl %ecx, %ecx + movq 0x30(%rsi), %rdx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x58(%rbp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x38(%rsi), %rdx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x58(%rbp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + xorl %ecx, %ecx + movq 0x20(%rbp), %rdx + mulxq 0x40(%rsi), %r8, %r9 + mulxq 0x48(%rsi), %rax, %r10 + addq %rax, %r9 + mulxq 0x50(%rsi), %rax, %r11 + adcq %rax, %r10 + mulxq 0x58(%rsi), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x28(%rbp), %rdx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x30(%rbp), %rdx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x58(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x38(%rbp), %rdx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x58(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + 
movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %ecx, %ecx + movq 0x0(%rbp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x8(%rbp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x10(%rbp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x18(%rbp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 
0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + xorl %ecx, %ecx + movq (%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0xb0(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0xb8(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x8(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x10(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb8(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x18(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0xb8(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %ecx, %ecx + movq 0x20(%rsp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x28(%rsp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x30(%rsp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 
0x38(%rsp), %rdx + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %ecx, %ecx + movq 0xc0(%rsp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0xb0(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0xb8(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0xc8(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0xd0(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb8(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0xd8(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0xb8(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq 
$0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq %r11, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0xc0(%rsp), %rax + movq 0x28(%rsp), %rcx + sbbq 0xc8(%rsp), %rcx + movq 0x30(%rsp), %r8 + sbbq 0xd0(%rsp), %r8 + movq 0x38(%rsp), %r9 + sbbq 0xd8(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq %r11, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0xa8(%rsp), %r9, %r10 + mulxq 0xb8(%rsp), %r11, %r12 + movq 0xb0(%rsp), %rdx + mulxq 0xb8(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0xb8(%rsp), %rdx + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0xa8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0xb0(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0xb8(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + sbbq %rax, %rax + movabsq $0xffffffff00000000, %rbx + movq %rax, %rcx + andq %rax, %rbx + btr $0x20, %rcx + subq %rax, %r12 + sbbq %rbx, 
%r13 + sbbq %rax, %r14 + sbbq %rcx, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %ecx, %ecx + movq 0x80(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x70(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x78(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x88(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x90(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x78(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x98(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x78(%rsp), %rax, %r15 + 
adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %ecx, %ecx + movq 0x40(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x70(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x78(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x48(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x50(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x78(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x58(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x78(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq 
%r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq %r11, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq %r11, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + xorl %ecx, %ecx + movq 0x40(%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0xb0(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0xb8(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x48(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x50(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb8(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x58(%rsi), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0xb8(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, 
%ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq %r11, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq %r11, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + xorl %ecx, %ecx + movq 0xc0(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x70(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x78(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0xc8(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0xd0(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x78(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0xd8(%rsp), %rdx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x78(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + 
leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorl %ecx, %ecx + movq 0x40(%rbp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0xb0(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0xb8(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x48(%rbp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x50(%rbp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb8(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x58(%rbp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0xb8(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + xorl %ecx, %ecx + movq 0x80(%rsp), %rdx + mulxq 0x20(%rsp), %r8, %r9 + mulxq 0x28(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x30(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x38(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x88(%rsp), %rdx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x90(%rsp), %rdx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 
+ mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x98(%rsp), %rdx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq %r11, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %r8 + movq 0x48(%rsi), %r9 + movq 0x50(%rsi), %r10 + movq 0x58(%rsi), %r11 + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + movq 0x40(%rbp), %r12 + movq 0x48(%rbp), %r13 + movq 0x50(%rbp), %r14 + movq 0x58(%rbp), %r15 + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + cmpq %rax, %rbx + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + cmoveq 0xa0(%rsp), %r12 + cmoveq 0xa8(%rsp), %r13 + cmoveq 0xb0(%rsp), %r14 + cmoveq 0xb8(%rsp), %r15 + movq (%rsp), %rax + cmovbq (%rsi), %rax + cmova 0x0(%rbp), %rax + movq 0x8(%rsp), %rbx + cmovbq 0x8(%rsi), %rbx + cmova 0x8(%rbp), %rbx + movq 0x10(%rsp), %rcx + cmovbq 0x10(%rsi), %rcx + cmova 0x10(%rbp), %rcx + movq 0x18(%rsp), %rdx + cmovbq 0x18(%rsi), %rdx + cmova 0x18(%rbp), %rdx + movq 0x80(%rsp), %r8 + cmovbq 0x20(%rsi), %r8 + cmova 0x20(%rbp), %r8 + movq 0x88(%rsp), %r9 + cmovbq 0x28(%rsi), %r9 + cmova 0x28(%rbp), %r9 + movq 0x90(%rsp), %r10 + cmovbq 0x30(%rsi), %r10 + cmova 0x30(%rbp), %r10 + movq 0x98(%rsp), %r11 + cmovbq 0x38(%rsi), %r11 + cmova 
0x38(%rbp), %r11 + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + movq %r8, 0x20(%rdi) + movq %r9, 0x28(%rdi) + movq %r10, 0x30(%rdi) + movq %r11, 0x38(%rdi) + movq %r12, 0x40(%rdi) + movq %r13, 0x48(%rdi) + movq %r14, 0x50(%rdi) + movq %r15, 0x58(%rdi) + addq $0xe0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +sm2_montjscalarmul_sm2_montjdouble: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsi), %r9, %r10 + mulxq 0x38(%rsi), %r11, %r12 + movq 0x30(%rsi), %rdx + mulxq 0x38(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsi), %rdx + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsi), %rdx + mulxq 
%rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + subq (%rsp), %rax + movq 0x8(%rsi), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq %r11, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq (%rsi), %rax + addq (%rsp), %rax + movq 0x8(%rsi), %rcx + adcq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + adcq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + subq %r11, %rax + movq %rax, 0x40(%rsp) + sbbq %r10, %rcx + movq %rcx, 0x48(%rsp) + sbbq %r11, %r8 + movq %r8, 0x50(%rsp) + sbbq %rdx, %r9 + movq %r9, 0x58(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rdx + mulxq 0x40(%rsp), %r8, %r9 + mulxq 0x48(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0x50(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0x58(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x68(%rsp), %rdx + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x70(%rsp), %rdx + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x58(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x78(%rsp), %rdx + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x58(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 
+ adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorq %r11, %r11 + movq 0x20(%rsi), %rax + addq 0x40(%rsi), %rax + movq 0x28(%rsi), %rcx + adcq 0x48(%rsi), %rcx + movq 0x30(%rsi), %r8 + adcq 0x50(%rsi), %r8 + movq 0x38(%rsi), %r9 + adcq 0x58(%rsi), %r9 + adcq %r11, %r11 + subq $0xffffffffffffffff, %rax + movabsq $0xffffffff00000000, %r10 + sbbq %r10, %rcx + sbbq $0xffffffffffffffff, %r8 + movabsq $0xfffffffeffffffff, %rdx + sbbq %rdx, %r9 + sbbq $0x0, %r11 + andq %r11, %r10 + andq %r11, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq %r11, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + xorl %ecx, %ecx + movq 0x20(%rsp), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rax, %r10 + addq %rax, %r9 + mulxq 0x10(%rsi), %rax, %r11 + adcq %rax, %r10 + mulxq 0x18(%rsi), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x28(%rsp), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x30(%rsp), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x38(%rsp), %rdx + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + 
subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x68(%rsp), %r9, %r10 + mulxq 0x78(%rsp), %r11, %r12 + movq 0x70(%rsp), %rdx + mulxq 0x78(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x78(%rsp), %rdx + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x68(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x70(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x78(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsp), %r9, %r10 + mulxq 0x58(%rsp), %r11, %r12 + movq 0x50(%rsp), %rdx + mulxq 0x58(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsp), %rdx 
+ mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq $0xffffffffffffffff, %r8 + movq %r8, %r10 + subq 0xa0(%rsp), %r8 + movabsq $0xffffffff00000000, %r9 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + movabsq $0xfffffffeffffffff, %r11 + sbbq 0xb8(%rsp), %r11 + xorl %r12d, %r12d + movq $0x9, %rdx + mulxq %r8, %r8, %rax + mulxq %r9, %r9, %rcx + addq %rax, %r9 + mulxq %r10, %r10, %rax + adcq %rcx, %r10 + mulxq %r11, %r11, %rcx + adcq %rax, %r11 + adcq %rcx, %r12 + movq $0xc, %rdx + xorl %eax, %eax + mulxq 0x80(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x88(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x90(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x98(%rsp), %rax, %rdx + adcxq %rax, %r11 + adoxq %r12, %rdx + adcq $0x1, %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, 0xa0(%rsp) + adcq %rax, %r9 + movq %r9, 0xa8(%rsp) + adcq %rdx, %r10 + movq %r10, 0xb0(%rsp) + adcq %rcx, %r11 + movq %r11, 0xb8(%rsp) + movq 0x40(%rsp), %rax + subq (%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq %r11, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, 
%r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %ecx, %ecx + movq 0x60(%rsp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rax, %r10 + addq %rax, %r9 + mulxq 0xb0(%rsp), %rax, %r11 + adcq %rax, %r10 + mulxq 0xb8(%rsp), %rax, %r12 + adcq %rax, %r11 + adcq %rcx, %r12 + xorl %ecx, %ecx + movq 0x68(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %r13 + adcxq %rax, %r12 + adoxq %rcx, %r13 + adcxq %rcx, %r13 + xorl %ecx, %ecx + movq 0x70(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb8(%rsp), %rax, %r14 + adcxq %rax, %r13 + adoxq %rcx, %r14 + adcxq %rcx, %r14 + xorl %ecx, %ecx + movq 0x78(%rsp), %rdx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0xb8(%rsp), %rax, %r15 + adcxq %rax, %r14 + adoxq %rcx, %r15 + adcxq %rcx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, 
%rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rsp), %rax + subq 0x20(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x28(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x30(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x38(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x40(%rdi) + adcq %r10, %rcx + movq %rcx, 0x48(%rdi) + adcq %r11, %r8 + movq %r8, 0x50(%rdi) + adcq %rdx, %r9 + movq %r9, 0x58(%rdi) + movq 0x98(%rsp), %r11 + movq %r11, %rdx + movq 0x90(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0x88(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0x80(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + shrq $0x3e, %rdx + addq $0x1, %rdx + subq 0xa0(%rsp), %r8 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + sbbq 0xb8(%rsp), %r11 + sbbq $0x0, %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, (%rdi) + adcq %rax, %r9 + movq %r9, 0x8(%rdi) + adcq %rdx, %r10 + movq %r10, 0x10(%rdi) + adcq %rcx, %r11 + movq %r11, 0x18(%rdi) + movq $0xffffffffffffffff, %r8 + movq %r8, %r10 + subq (%rsp), %r8 + movabsq $0xffffffff00000000, %r9 + sbbq 0x8(%rsp), %r9 + sbbq 0x10(%rsp), %r10 + movabsq $0xfffffffeffffffff, %r11 + sbbq 0x18(%rsp), %r11 + movq %r11, %r12 + shldq $0x3, %r10, %r11 + shldq $0x3, %r9, %r10 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + shrq $0x3d, %r12 + movq $0x3, %rdx + xorl %eax, %eax + mulxq 0x60(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x68(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x70(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x78(%rsp), %rax, %rdx + adcxq %rax, %r11 + adoxq %r12, %rdx + adcq $0x1, %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, 0x20(%rdi) + adcq %rax, %r9 + movq %r9, 0x28(%rdi) + adcq %rdx, %r10 + movq %r10, 0x30(%rdi) + adcq %rcx, %r11 + movq %r11, 0x38(%rdi) + addq $0xc0, 
%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul_alt.S b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul_alt.S new file mode 100644 index 00000000000..e946fbac25d --- /dev/null +++ b/third_party/s2n-bignum/s2n-bignum-imported/x86_att/sm2/sm2_montjscalarmul_alt.S @@ -0,0 +1,4526 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for GM/T 0003-2012 curve SM2 +// Input scalar[4], point[12]; output res[12] +// +// extern void sm2_montjscalarmul_alt +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version sm2_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_sm2 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve SM2, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_sm2) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(sm2_montjscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(sm2_montjscalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. Uppercase syntactic variants +// make x86_att version simpler to generate. + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define res (31*NUMSIZE)(%rsp) + +#define NSPACE (32*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+96*(I-1)(%rsp), %rax ; \ + cmovzq TAB+96*(I-1)+8(%rsp), %rbx ; \ + cmovzq TAB+96*(I-1)+16(%rsp), %rcx ; \ + cmovzq TAB+96*(I-1)+24(%rsp), %rdx ; \ + cmovzq TAB+96*(I-1)+32(%rsp), %r8 ; \ + cmovzq TAB+96*(I-1)+40(%rsp), %r9 ; \ + cmovzq TAB+96*(I-1)+48(%rsp), %r10 ; \ + cmovzq TAB+96*(I-1)+56(%rsp), %r11 ; \ + cmovzq TAB+96*(I-1)+64(%rsp), %r12 ; \ + cmovzq TAB+96*(I-1)+72(%rsp), %r13 ; \ + cmovzq TAB+96*(I-1)+80(%rsp), %r14 ; \ + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + +S2N_BN_SYMBOL(sm2_montjscalarmul_alt): + _CET_ENDBR + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. 
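The header comment above already pins down the C-visible contract, so a caller only has to supply three little-endian limb arrays. The sketch below is a minimal, hypothetical usage example derived solely from the prototype and coordinate convention documented above (Jacobian triples (x',y',z') with coordinates in the Montgomery domain, z' = 0 encoding the point at infinity); the wrapper name and surrounding setup are illustrative assumptions, not part of the imported file.

  #include <stdint.h>

  /* Prototype exactly as documented in the header comment above. */
  extern void sm2_montjscalarmul_alt(uint64_t res[static 12],
                                     uint64_t scalar[static 4],
                                     uint64_t point[static 12]);

  /* Hypothetical wrapper: computes n * P for a point already converted to
     Montgomery-Jacobian form. The prototype takes non-const arrays, so the
     inputs are copied into scratch buffers first. */
  static void sm2_scalarmul_example(uint64_t out[static 12],
                                    const uint64_t n[static 4],
                                    const uint64_t p_jac[static 12]) {
    uint64_t scalar[4], point[12];
    for (int i = 0; i < 4; i++)  scalar[i] = n[i];
    for (int i = 0; i < 12; i++) point[i]  = p_jac[i];
    sm2_montjscalarmul_alt(out, scalar, point);
    /* out[8..11] all zero means the result is the point at infinity. */
  }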
+ +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq sm2_montjscalarmul_alt_standard + popq %rsi + popq %rdi + ret + +sm2_montjscalarmul_alt_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + movq %rdx, %rbx + movq %rdi, res + +// Load the digits of group order n_sm2 = [%r15;%r14;%r13;%r12] + + movq $0x53bbf40939d54123, %r12 + movq $0x7203df6b21c6052b, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xfffffffeffffffff, %r15 + +// First, reduce the input scalar mod n_sm2, i.e. conditionally subtract n_sm2 + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + +// Now if the top bit of the reduced scalar is set, negate it mod n_sm2, +// i.e. do n |-> n_sm2 - n. Remember the sign in %rbp so we can +// correspondingly negate the point below. + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %r11, %rbp + shrq $63, %rbp + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + movq $0x8888888888888888, %rax + addq %rax, %r8 + adcq %rax, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + btc $63, %r11 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_sm2 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. 
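The recoding step above compresses a fair amount of reasoning: once the recoding constant has been added, every 4-bit window of the adjusted scalar, minus a bias of 8, is a signed digit in the range -8..7; the main loop further down recovers sign and magnitude with a subq/sbbq/xorq/subq sequence and uses the sign to drive the same careful y-negation described for tab[0] in the comment directly above. The C fragment below is only an illustrative model of that digit extraction, with hypothetical names (extract_digit, recoded); it mirrors the shift/mask/bias arithmetic of the assembly but is not part of the imported file.

  #include <stdint.h>

  /* Illustrative model: take the 4-bit window of the recoded scalar starting
     at bit position i (a multiple of 4), remove the bias of 8, and split the
     signed digit into a sign flag and an absolute value in 0..8. 'recoded'
     plays the role of the SCALARB limbs on the stack. */
  static void extract_digit(const uint64_t recoded[4], int i,
                            int *negate, uint64_t *abs_digit) {
    uint64_t window = (recoded[i >> 6] >> (i & 63)) & 15;  /* 4-bit window  */
    int64_t digit = (int64_t)window - 8;                   /* signed, -8..7 */
    *negate = digit < 0;              /* selects the conditional y negation */
    *abs_digit = (uint64_t)(digit < 0 ? -digit : digit);   /* table index 0..8 */
  }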
+ + movq (%rbx), %rax + movq %rax, TAB(%rsp) + movq 8(%rbx), %rax + movq %rax, TAB+8(%rsp) + movq 16(%rbx), %rax + movq %rax, TAB+16(%rsp) + movq 24(%rbx), %rax + movq %rax, TAB+24(%rsp) + + movq 32(%rbx), %r12 + movq %r12, %rax + movq 40(%rbx), %r13 + orq %r13, %rax + movq 48(%rbx), %r14 + movq %r14, %rcx + movq 56(%rbx), %r15 + orq %r15, %rcx + orq %rcx, %rax + cmovzq %rax, %rbp + + xorl %r11d, %r11d + movl $0x00000000ffffffff, %r9d + notq %r11 + movq %r11, %r8 + movq %r11, %r10 + xorq %r8, %r9 + btr $32, %r11 + + subq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + sbbq %r15, %r11 + testq %rbp, %rbp + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + movq %r8, TAB+32(%rsp) + movq %r9, TAB+40(%rsp) + movq %r10, TAB+48(%rsp) + movq %r11, TAB+56(%rsp) + + movq 64(%rbx), %rax + movq %rax, TAB+64(%rsp) + movq 72(%rbx), %rax + movq %rax, TAB+72(%rsp) + movq 80(%rbx), %rax + movq %rax, TAB+80(%rsp) + movq 88(%rbx), %rax + movq %rax, TAB+88(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + leaq TAB+96*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq TAB+96*2(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_alt_sm2_montjadd + + leaq TAB+96*3(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq TAB+96*4(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_alt_sm2_montjadd + + leaq TAB+96*5(%rsp), %rdi + leaq TAB+96*2(%rsp), %rsi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq TAB+96*6(%rsp), %rdi + leaq TAB+96*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq sm2_montjscalarmul_alt_sm2_montjadd + + leaq TAB+96*7(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + callq sm2_montjscalarmul_alt_sm2_montjdouble + +// Set up accumulator as table entry for top 4 bits (constant-time indexing) + + movq SCALARB+24(%rsp), %rdi + shrq $60, %rdi + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + + movq %rax, ACC(%rsp) + movq %rbx, ACC+8(%rsp) + movq %rcx, ACC+16(%rsp) + movq %rdx, ACC+24(%rsp) + movq %r8, ACC+32(%rsp) + movq %r9, ACC+40(%rsp) + movq %r10, ACC+48(%rsp) + movq %r11, ACC+56(%rsp) + movq %r12, ACC+64(%rsp) + movq %r13, ACC+72(%rsp) + movq %r14, ACC+80(%rsp) + movq %r15, ACC+88(%rsp) + +// Main loop over size-4 bitfield + + movl $252, %ebp + +sm2_montjscalarmul_alt_mainloop: + subq $4, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_alt_sm2_montjdouble + + movq %rbp, %rax + shrq $6, %rax + movq (%rsp,%rax,8), %rdi + movq %rbp, %rcx + shrq %cl, %rdi + andq $15, %rdi + + subq $8, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, 
%r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_sm2 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq %r12, TABENT+64(%rsp) + movq %r13, TABENT+72(%rsp) + movq %r14, TABENT+80(%rsp) + movq %r15, TABENT+88(%rsp) + + xorl %r15d, %r15d + movq %r8, %rax + movl $0x00000000ffffffff, %r13d + orq %r9, %rax + notq %r15 + movq %r10, %rcx + movq %r15, %r12 + orq %r11, %rcx + movq %r15, %r14 + xorq %r12, %r13 + btr $32, %r15 + orq %rcx, %rax + cmovzq %rax, %rsi + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + testq %rsi, %rsi + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq sm2_montjscalarmul_alt_sm2_montjadd + + testq %rbp, %rbp + jne sm2_montjscalarmul_alt_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + movq res, %rdi + movq ACC(%rsp), %rax + movq %rax, (%rdi) + movq ACC+8(%rsp), %rax + movq %rax, 8(%rdi) + movq ACC+16(%rsp), %rax + movq %rax, 16(%rdi) + movq ACC+24(%rsp), %rax + movq %rax, 24(%rdi) + + movq ACC+32(%rsp), %rax + movq %rax, 32(%rdi) + movq ACC+40(%rsp), %rax + movq %rax, 40(%rdi) + movq ACC+48(%rsp), %rax + movq %rax, 48(%rdi) + movq ACC+56(%rsp), %rax + movq %rax, 56(%rdi) + + movq ACC+64(%rsp), %rax + movq %rax, 64(%rdi) + movq ACC+72(%rsp), %rax + movq %rax, 72(%rdi) + movq ACC+80(%rsp), %rax + movq %rax, 80(%rdi) + movq ACC+88(%rsp), %rax + movq %rax, 88(%rdi) + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +sm2_montjscalarmul_alt_sm2_montjadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xe0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq 
$0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x40(%rbp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rbp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rbp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rbp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rbp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rbp), %rbx + movq 0x48(%rbp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rbp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rbp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rbp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx 
+ adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rbp), %rax + mulq 0x20(%rsi) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x40(%rbp), %rax + mulq 0x28(%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x48(%rbp), %rax + mulq 0x20(%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x40(%rbp), %rax + mulq 0x30(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x48(%rbp), %rax + mulq 0x28(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x50(%rbp), %rax + mulq 0x20(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x40(%rbp), %rax + mulq 0x38(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x48(%rbp), %rax + mulq 0x30(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x50(%rbp), %rax + mulq 0x28(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x58(%rbp), %rax + mulq 0x20(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x48(%rbp), %rax + mulq 0x38(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rbp), %rax + mulq 0x30(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x58(%rbp), %rax + mulq 0x28(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x50(%rbp), %rax + mulq 0x38(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rbp), %rax + mulq 0x30(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x58(%rbp), %rax + mulq 0x38(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsi), %rax + mulq 0x20(%rbp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x40(%rsi), %rax + mulq 0x28(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x48(%rsi), %rax + mulq 0x20(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x40(%rsi), %rax + mulq 0x30(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x48(%rsi), %rax 
+ mulq 0x28(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x50(%rsi), %rax + mulq 0x20(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x40(%rsi), %rax + mulq 0x38(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x48(%rsi), %rax + mulq 0x30(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x50(%rsi), %rax + mulq 0x28(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x58(%rsi), %rax + mulq 0x20(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x48(%rsi), %rax + mulq 0x38(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsi), %rax + mulq 0x30(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x58(%rsi), %rax + mulq 0x28(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x50(%rsi), %rax + mulq 0x38(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsi), %rax + mulq 0x30(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x58(%rsi), %rax + mulq 0x38(%rbp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsp), %rax + mulq 0x0(%rbp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsp), %rax + mulq 0x8(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsp), %rax + mulq 0x0(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq (%rsp), %rax + mulq 0x10(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsp), %rax + mulq 0x8(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsp), %rax + mulq 0x0(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsp), %rax + mulq 0x18(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsp), %rax + mulq 0x10(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsp), %rax + mulq 0x8(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsp), %rax + mulq 0x0(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsp), %rax + mulq 0x18(%rbp) + addq %rax, %r12 + adcq 
%rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsp), %rax + mulq 0x10(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsp), %rax + mulq 0x8(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsp), %rax + mulq 0x18(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsp), %rax + mulq 0x10(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsp), %rax + mulq 0x18(%rbp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq 0xa0(%rsp), %rax + mulq (%rsi) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0xa0(%rsp), %rax + mulq 0x8(%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xa8(%rsp), %rax + mulq (%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0xa0(%rsp), %rax + mulq 0x10(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0xa8(%rsp), %rax + mulq 0x8(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0xb0(%rsp), %rax + mulq (%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0xa0(%rsp), %rax + mulq 0x18(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0xa8(%rsp), %rax + mulq 0x10(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb0(%rsp), %rax + mulq 0x8(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb8(%rsp), %rax + mulq (%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq 0x18(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq 0x10(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0xb8(%rsp), %rax + mulq 0x8(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq 0x18(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq 0x10(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0xb8(%rsp), %rax + mulq 0x18(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + 
subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq (%rsp), %rax + mulq 0x20(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsp), %rax + mulq 0x28(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsp), %rax + mulq 0x20(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq (%rsp), %rax + mulq 0x30(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsp), %rax + mulq 0x28(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsp), %rax + mulq 0x20(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsp), %rax + mulq 0x38(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsp), %rax + mulq 0x30(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsp), %rax + mulq 0x28(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsp), %rax + mulq 0x20(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsp), %rax + mulq 0x38(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsp), %rax + mulq 0x30(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsp), %rax + mulq 0x28(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsp), %rax + mulq 0x38(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsp), %rax + mulq 0x30(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsp), %rax + mulq 0x38(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq 
%rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0xa0(%rsp), %rax + mulq 0xc0(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0xa0(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xa8(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0xa0(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0xa8(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0xb0(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0xa0(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0xa8(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb0(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb8(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0xb8(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0xb8(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), 
%rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq %r11, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0xc0(%rsp), %rax + movq 0x28(%rsp), %rcx + sbbq 0xc8(%rsp), %rcx + movq 0x30(%rsp), %r8 + sbbq 0xd0(%rsp), %r8 + movq 0x38(%rsp), %r9 + sbbq 0xd8(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq %r11, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0xa8(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0xb8(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0xb0(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0xb8(%rsp), %rbx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 
0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x60(%rsp), %rax + mulq 0x80(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x60(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x68(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x60(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x68(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x70(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x60(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x68(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x70(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x78(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x68(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x78(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x70(%rsp), %rax + mulq 0x98(%rsp) 
+ addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x78(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rax + mulq 0x40(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x60(%rsp), %rax + mulq 0x48(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x68(%rsp), %rax + mulq 0x40(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x60(%rsp), %rax + mulq 0x50(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x68(%rsp), %rax + mulq 0x48(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x70(%rsp), %rax + mulq 0x40(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x60(%rsp), %rax + mulq 0x58(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x68(%rsp), %rax + mulq 0x50(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x70(%rsp), %rax + mulq 0x48(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x78(%rsp), %rax + mulq 0x40(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x68(%rsp), %rax + mulq 0x58(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rax + mulq 0x50(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x78(%rsp), %rax + mulq 0x48(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x70(%rsp), %rax + mulq 0x58(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rax + mulq 0x50(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x78(%rsp), %rax + mulq 0x58(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, 
%r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq %r11, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq %r11, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq 0xa0(%rsp), %rax + mulq 0x40(%rsi) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0xa0(%rsp), %rax + mulq 0x48(%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xa8(%rsp), %rax + mulq 0x40(%rsi) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0xa0(%rsp), %rax + mulq 0x50(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0xa8(%rsp), %rax + mulq 0x48(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0xb0(%rsp), %rax + mulq 0x40(%rsi) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0xa0(%rsp), %rax + mulq 0x58(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0xa8(%rsp), %rax + mulq 0x50(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb0(%rsp), %rax + mulq 0x48(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb8(%rsp), %rax + mulq 0x40(%rsi) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq 0x58(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq 0x50(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0xb8(%rsp), %rax + mulq 0x48(%rsi) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq 0x58(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq 0x50(%rsi) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0xb8(%rsp), %rax + mulq 0x58(%rsi) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + 
sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq %r11, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq %r11, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x60(%rsp), %rax + mulq 0xc0(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x60(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x68(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x60(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x68(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x70(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x60(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x68(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x70(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x78(%rsp), %rax + mulq 0xc0(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x68(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rax + mulq 0xd0(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x78(%rsp), %rax + mulq 0xc8(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x70(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rax + 
mulq 0xd0(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x78(%rsp), %rax + mulq 0xd8(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0xa0(%rsp), %rax + mulq 0x40(%rbp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0xa0(%rsp), %rax + mulq 0x48(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xa8(%rsp), %rax + mulq 0x40(%rbp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0xa0(%rsp), %rax + mulq 0x50(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0xa8(%rsp), %rax + mulq 0x48(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0xb0(%rsp), %rax + mulq 0x40(%rbp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0xa0(%rsp), %rax + mulq 0x58(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0xa8(%rsp), %rax + mulq 0x50(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb0(%rsp), %rax + mulq 0x48(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb8(%rsp), %rax + mulq 0x40(%rbp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq 0x58(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq 0x50(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0xb8(%rsp), %rax + mulq 0x48(%rbp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq 0x58(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq 0x50(%rbp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0xb8(%rsp), %rax + mulq 0x58(%rbp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + 
shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x20(%rsp), %rax + mulq 0x80(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x20(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x28(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x20(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x28(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x30(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x20(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x28(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x30(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x38(%rsp), %rax + mulq 0x80(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x28(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x38(%rsp), %rax + mulq 0x88(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x30(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rax + mulq 0x90(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x38(%rsp), %rax + mulq 0x98(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq 
%r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq %r11, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %r8 + movq 0x48(%rsi), %r9 + movq 0x50(%rsi), %r10 + movq 0x58(%rsi), %r11 + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + movq 0x40(%rbp), %r12 + movq 0x48(%rbp), %r13 + movq 0x50(%rbp), %r14 + movq 0x58(%rbp), %r15 + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + cmpq %rax, %rbx + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + cmoveq 0xa0(%rsp), %r12 + cmoveq 0xa8(%rsp), %r13 + cmoveq 0xb0(%rsp), %r14 + cmoveq 0xb8(%rsp), %r15 + movq (%rsp), %rax + cmovbq (%rsi), %rax + cmova 0x0(%rbp), %rax + movq 0x8(%rsp), %rbx + cmovbq 0x8(%rsi), %rbx + cmova 0x8(%rbp), %rbx + movq 0x10(%rsp), %rcx + cmovbq 0x10(%rsi), %rcx + cmova 0x10(%rbp), %rcx + movq 0x18(%rsp), %rdx + cmovbq 0x18(%rsi), %rdx + cmova 0x18(%rbp), %rdx + movq 0x80(%rsp), %r8 + cmovbq 0x20(%rsi), %r8 + cmova 0x20(%rbp), %r8 + movq 0x88(%rsp), %r9 + cmovbq 0x28(%rsi), %r9 + cmova 0x28(%rbp), %r9 + movq 0x90(%rsp), %r10 + cmovbq 0x30(%rsi), %r10 + cmova 0x30(%rbp), %r10 + movq 0x98(%rsp), %r11 + cmovbq 0x38(%rsi), %r11 + cmova 0x38(%rbp), %r11 + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + movq %r8, 0x20(%rdi) + movq %r9, 0x28(%rdi) + movq %r10, 0x30(%rdi) + movq %r11, 0x38(%rdi) + movq %r12, 0x40(%rdi) + movq %r13, 0x48(%rdi) + movq %r14, 0x50(%rdi) + movq %r15, 0x58(%rdi) + addq $0xe0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +sm2_montjscalarmul_alt_sm2_montjdouble: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + 
movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsi), %rbx + movq 0x28(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, 
%r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + subq (%rsp), %rax + movq 0x8(%rsi), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq %r11, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq (%rsi), %rax + addq (%rsp), %rax + movq 0x8(%rsi), %rcx + adcq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + adcq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + subq %r11, %rax + movq %rax, 0x40(%rsp) + sbbq %r10, %rcx + movq %rcx, 0x48(%rsp) + sbbq %r11, %r8 + movq %r8, 0x50(%rsp) + sbbq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x40(%rsp), %rax + mulq 0x60(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0x40(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x48(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0x40(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x48(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x50(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x40(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x48(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x50(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x58(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x48(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x58(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x50(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x58(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, 
%eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorq %r11, %r11 + movq 0x20(%rsi), %rax + addq 0x40(%rsi), %rax + movq 0x28(%rsi), %rcx + adcq 0x48(%rsi), %rcx + movq 0x30(%rsi), %r8 + adcq 0x50(%rsi), %r8 + movq 0x38(%rsi), %r9 + adcq 0x58(%rsi), %r9 + adcq %r11, %r11 + subq $0xffffffffffffffff, %rax + movabsq $0xffffffff00000000, %r10 + sbbq %r10, %rcx + sbbq $0xffffffffffffffff, %r8 + movabsq $0xfffffffeffffffff, %rdx + sbbq %rdx, %r9 + sbbq $0x0, %r11 + andq %r11, %r10 + andq %r11, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq %r11, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq (%rsi), %rax + mulq 0x20(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsi), %rax + mulq 0x28(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsi), %rax + mulq 0x20(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq (%rsi), %rax + mulq 0x30(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsi), %rax + mulq 0x28(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsi), %rax + mulq 0x20(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq (%rsi), %rax + mulq 0x38(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsi), %rax + mulq 0x30(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsi), %rax + mulq 0x28(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsi), %rax + mulq 0x20(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x8(%rsi), %rax + mulq 0x38(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rax + mulq 0x30(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsi), %rax + mulq 0x28(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x10(%rsi), %rax + mulq 0x38(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rax + mulq 0x30(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsi), %rax + mulq 0x38(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 
+ adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x68(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x78(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x70(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x68(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x78(%rsp), %rbx + movq 0x68(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x68(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x70(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x78(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsp), %rbx + movq 0x48(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 
+ xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq $0xffffffffffffffff, %r9 + movq %r9, %r11 + subq 0xa0(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq 0xa8(%rsp), %r10 + sbbq 0xb0(%rsp), %r11 + movabsq $0xfffffffeffffffff, %r12 + sbbq 0xb8(%rsp), %r12 + movq $0x9, %rcx + movq %r9, %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + movq %r10, %rax + xorl %r10d, %r10d + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + movq %r11, %rax + xorl %r11d, %r11d + mulq %rcx + addq %rax, %r10 + adcq %rdx, %r11 + movq %r12, %rax + xorl %r12d, %r12d + mulq %rcx + addq %rax, %r11 + adcq %rdx, %r12 + movl $0xc, %ecx + movq 0x80(%rsp), %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbx, %rbx + movq 0x88(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbx, %rbx + movq 0x90(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbx, %rbx + movq 0x98(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + leaq 0x1(%r12), %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, 0xa0(%rsp) + adcq %rax, %r9 + movq %r9, 0xa8(%rsp) + adcq %rdx, %r10 + movq %r10, 0xb0(%rsp) + adcq %rcx, %r11 + movq %r11, 0xb8(%rsp) + movq 0x40(%rsp), %rax + subq (%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx 
+ movq %rcx, 0x48(%rsp) + adcq %r11, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0xa0(%rsp), %rax + mulq 0x60(%rsp) + movq %rax, %r8 + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq 0xa0(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xa8(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + xorq %r12, %r12 + movq 0xa0(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0xa8(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0xb0(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0xa0(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0xa8(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb0(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0xb8(%rsp), %rax + mulq 0x60(%rsp) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + 
xorq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0xb8(%rsp), %rax + mulq 0x68(%rsp) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq 0x70(%rsp) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0xb8(%rsp), %rax + mulq 0x78(%rsp) + addq %rax, %r14 + adcq %rdx, %r15 + movq %r8, %rax + shlq $0x20, %rax + movq %r8, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r8, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + sbbq %rdx, %r11 + sbbq %rbx, %r8 + movq %r9, %rax + shlq $0x20, %rax + movq %r9, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r9, %rax + sbbq $0x0, %rcx + subq %rax, %r10 + sbbq %rcx, %r11 + sbbq %rdx, %r8 + sbbq %rbx, %r9 + movq %r10, %rax + shlq $0x20, %rax + movq %r10, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r10, %rax + sbbq $0x0, %rcx + subq %rax, %r11 + sbbq %rcx, %r8 + sbbq %rdx, %r9 + sbbq %rbx, %r10 + movq %r11, %rax + shlq $0x20, %rax + movq %r11, %rcx + shrq $0x20, %rcx + movq %rax, %rdx + movq %rcx, %rbx + subq %r11, %rax + sbbq $0x0, %rcx + subq %rax, %r8 + sbbq %rcx, %r9 + sbbq %rdx, %r10 + sbbq %rbx, %r11 + xorl %eax, %eax + addq %r8, %r12 + adcq %r9, %r13 + adcq %r10, %r14 + adcq %r11, %r15 + adcq %rax, %rax + movl $0x1, %ecx + movl $0xffffffff, %edx + xorl %ebx, %ebx + addq %r12, %rcx + leaq 0x1(%rdx), %r11 + adcq %r13, %rdx + leaq -0x1(%rbx), %r8 + adcq %r14, %rbx + adcq %r15, %r11 + adcq %rax, %r8 + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbx, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rsp), %rax + subq 0x20(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x28(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x30(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x38(%rsp), %r9 + movabsq $0xffffffff00000000, %r10 + sbbq %r11, %r11 + andq %r11, %r10 + movq %r11, %rdx + btr $0x20, %rdx + addq %r11, %rax + movq %rax, 0x40(%rdi) + adcq %r10, %rcx + movq %rcx, 0x48(%rdi) + adcq %r11, %r8 + movq %r8, 0x50(%rdi) + adcq %rdx, %r9 + movq %r9, 0x58(%rdi) + movq 0x98(%rsp), %r11 + movq %r11, %rdx + movq 0x90(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0x88(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0x80(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + shrq $0x3e, %rdx + addq $0x1, %rdx + subq 0xa0(%rsp), %r8 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + sbbq 0xb8(%rsp), %r11 + sbbq $0x0, %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, (%rdi) + adcq %rax, %r9 + movq %r9, 0x8(%rdi) + adcq %rdx, %r10 + movq %r10, 0x10(%rdi) + adcq %rcx, %r11 + movq %r11, 0x18(%rdi) + movq $0xffffffffffffffff, %r8 + movq %r8, %r10 + subq (%rsp), %r8 + movabsq $0xffffffff00000000, %r9 + sbbq 0x8(%rsp), %r9 + sbbq 0x10(%rsp), %r10 + movabsq $0xfffffffeffffffff, %r11 + sbbq 0x18(%rsp), %r11 + movq %r11, %r12 + shldq $0x3, %r10, %r11 + shldq $0x3, %r9, %r10 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + shrq $0x3d, %r12 + movl $0x3, %ecx + movq 0x60(%rsp), %rax + mulq %rcx + 
addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbx, %rbx + movq 0x68(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbx, %rbx + movq 0x70(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbx, %rbx + movq 0x78(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + leaq 0x1(%r12), %rdx + movq %rdx, %rax + shlq $0x20, %rax + movq %rax, %rcx + subq %rdx, %rax + addq %rdx, %r8 + adcq %rax, %r9 + adcq $0x0, %r10 + adcq %rcx, %r11 + sbbq %rdx, %rdx + notq %rdx + movabsq $0xffffffff00000000, %rax + andq %rdx, %rax + movq %rdx, %rcx + btr $0x20, %rcx + addq %rdx, %r8 + movq %r8, 0x20(%rdi) + adcq %rax, %r9 + movq %r9, 0x28(%rdi) + adcq %rdx, %r10 + movq %r10, 0x30(%rdi) + adcq %rcx, %r11 + movq %r11, 0x38(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif
diff --git a/third_party/s2n-bignum/s2n-bignum_aws-lc.h b/third_party/s2n-bignum/s2n-bignum_aws-lc.h
new file mode 100644
index 00000000000..a08696d4ace
--- /dev/null
+++ b/third_party/s2n-bignum/s2n-bignum_aws-lc.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License").
+ * You may not use this file except in compliance with the License.
+ * A copy of the License is located at
+ *
+ *  http://aws.amazon.com/apache2.0
+ *
+ * or in the "LICENSE" file accompanying this file. This file is distributed
+ * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
+ */
+#ifndef S2N_BIGNUM_AWS_LC_H
+#define S2N_BIGNUM_AWS_LC_H
+
+#include "s2n-bignum-imported/include/s2n-bignum.h"
+
+// ----------------------------------------------------------------------------
+// C prototypes for s2n-bignum functions used in AWS-LC
+// ----------------------------------------------------------------------------
+
+// For some functions there are additional variants with names ending in
+// "_alt". These have the same core mathematical functionality as their
+// non-"alt" versions, but can be better suited to some microarchitectures:
+//
+//   - On x86, the "_alt" forms avoid BMI and ADX instruction set
+//     extensions, so will run on any x86_64 machine, even older ones
+//
+//   - On ARM, the "_alt" forms target machines with higher multiplier
+//     throughput, generally offering higher performance there.
+// For each of those, we define a _selector function that selects, at runtime,
+// the _alt or non-_alt version to run.
+
+#if defined(OPENSSL_X86_64)
+// On x86_64 platforms s2n-bignum uses the bmi2 and adx instruction sets
+// for some of the functions. These instructions are not supported by
+// every x86 CPU, so we have to check whether they are available and, if
+// they are not, fall back to a slightly slower but generic implementation.
+static inline uint8_t use_s2n_bignum_alt(void) {
+  return (!CRYPTO_is_BMI2_capable() || !CRYPTO_is_ADX_capable());
+}
+#else
+// On aarch64 platforms s2n-bignum has two implementations of certain
+// functions -- the default one and the alternative (suffixed _alt).
+// Depending on the architecture, one version is faster than the other.
+// Generally, the "_alt" functions are faster on architectures with higher
+// multiplier throughput, for example, Graviton 3, Apple's M1 and iPhone chips.
+static inline uint8_t use_s2n_bignum_alt(void) {
+  return CRYPTO_is_ARMv8_wide_multiplier_capable();
+}
+#endif
+
+#define S2NBIGNUM_KSQR_16_32_TEMP_NWORDS 24
+#define S2NBIGNUM_KMUL_16_32_TEMP_NWORDS 32
+#define S2NBIGNUM_KSQR_32_64_TEMP_NWORDS 72
+#define S2NBIGNUM_KMUL_32_64_TEMP_NWORDS 96
+
+static inline void p256_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]) {
+  if (use_s2n_bignum_alt()) { p256_montjscalarmul_alt(res, scalar, point); }
+  else { p256_montjscalarmul(res, scalar, point); }
+}
+
+static inline void bignum_deamont_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) {
+  if (use_s2n_bignum_alt()) { bignum_deamont_p384_alt(z, x); }
+  else { bignum_deamont_p384(z, x); }
+}
+
+static inline void bignum_montmul_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]) {
+  if (use_s2n_bignum_alt()) { bignum_montmul_p384_alt(z, x, y); }
+  else { bignum_montmul_p384(z, x, y); }
+}
+
+static inline void bignum_montsqr_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) {
+  if (use_s2n_bignum_alt()) { bignum_montsqr_p384_alt(z, x); }
+  else { bignum_montsqr_p384(z, x); }
+}
+
+static inline void bignum_tomont_p384_selector(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6]) {
+  if (use_s2n_bignum_alt()) { bignum_tomont_p384_alt(z, x); }
+  else { bignum_tomont_p384(z, x); }
+}
+
+static inline void p384_montjdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 18],uint64_t p1[S2N_BIGNUM_STATIC 18]) {
+  if (use_s2n_bignum_alt()) { p384_montjdouble_alt(p3, p1); }
+  else { p384_montjdouble(p3, p1); }
+}
+
+static inline void p384_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]) {
+  if (use_s2n_bignum_alt()) { p384_montjscalarmul_alt(res, scalar, point); }
+  else { p384_montjscalarmul(res, scalar, point); }
+}
+
+static inline void bignum_mul_p521_selector(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9], const uint64_t y[S2N_BIGNUM_STATIC 9]) {
+  if (use_s2n_bignum_alt()) { bignum_mul_p521_alt(z, x, y); }
+  else { bignum_mul_p521(z, x, y); }
+}
+
+static inline void bignum_sqr_p521_selector(uint64_t z[S2N_BIGNUM_STATIC 9], const uint64_t x[S2N_BIGNUM_STATIC 9]) {
+  if (use_s2n_bignum_alt()) { bignum_sqr_p521_alt(z, x); }
+  else { bignum_sqr_p521(z, x); }
+}
+
+static inline void p521_jdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]) {
+  if (use_s2n_bignum_alt()) { p521_jdouble_alt(p3, p1); }
+  else { p521_jdouble(p3, p1); }
+}
+
+static inline void p521_jscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]) {
+  if (use_s2n_bignum_alt()) { p521_jscalarmul_alt(res, scalar, point); }
+  else { p521_jscalarmul(res, scalar, point); }
+}
+
+static inline void curve25519_x25519_byte_selector(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32], const uint8_t point[S2N_BIGNUM_STATIC 32]) {
+  if (use_s2n_bignum_alt()) { curve25519_x25519_byte_alt(res, scalar, point); }
+  else { curve25519_x25519_byte(res, scalar, point); }
+}
+
+static inline void curve25519_x25519base_byte_selector(uint8_t res[S2N_BIGNUM_STATIC 32], const uint8_t scalar[S2N_BIGNUM_STATIC 32]) {
+  if (use_s2n_bignum_alt()) { curve25519_x25519base_byte_alt(res, scalar); }
+  else { curve25519_x25519base_byte(res, scalar); }
+}
+
+static inline void bignum_madd_n25519_selector(uint64_t z[S2N_BIGNUM_STATIC 4], uint64_t x[S2N_BIGNUM_STATIC 4], uint64_t y[S2N_BIGNUM_STATIC 4], uint64_t c[S2N_BIGNUM_STATIC 4]) {
+  if (use_s2n_bignum_alt()) { bignum_madd_n25519_alt(z, x, y, c); }
+  else { bignum_madd_n25519(z, x, y, c); }
+}
+
+static inline uint64_t edwards25519_decode_selector(uint64_t z[S2N_BIGNUM_STATIC 8], const uint8_t c[S2N_BIGNUM_STATIC 32]) {
+  if (use_s2n_bignum_alt()) { return edwards25519_decode_alt(z, c); }
+  else { return edwards25519_decode(z, c); }
+}
+
+static inline void edwards25519_scalarmulbase_selector(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4]) {
+  if (use_s2n_bignum_alt()) { edwards25519_scalarmulbase_alt(res, scalar); }
+  else { edwards25519_scalarmulbase(res, scalar); }
+}
+
+static inline void edwards25519_scalarmuldouble_selector(uint64_t res[S2N_BIGNUM_STATIC 8], uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 8], uint64_t bscalar[S2N_BIGNUM_STATIC 4]) {
+  if (use_s2n_bignum_alt()) { edwards25519_scalarmuldouble_alt(res, scalar, point, bscalar); }
+  else { edwards25519_scalarmuldouble(res, scalar, point, bscalar); }
+}
+
+#endif // S2N_BIGNUM_AWS_LC_H
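
For readers unfamiliar with the _selector pattern introduced by this header, the following is a minimal, self-contained C sketch of the same idea. It is not part of the diff: the widget_* functions and the stubbed use_widget_alt() probe are hypothetical stand-ins for the real pairs (e.g. bignum_montmul_p384 / bignum_montmul_p384_alt) and for the real capability checks (CRYPTO_is_BMI2_capable(), CRYPTO_is_ADX_capable(), CRYPTO_is_ARMv8_wide_multiplier_capable()).

/* Sketch of the _selector dispatch pattern; all names here are made up. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for a runtime CPU-feature probe such as CRYPTO_is_ADX_capable(). */
static inline uint8_t use_widget_alt(void) {
  return 0; /* pretend the non-_alt code path is preferable on this CPU */
}

/* Two variants that compute identical results, like foo() and foo_alt(). */
static inline void widget_mul(uint64_t z[4], const uint64_t x[4], uint64_t k) {
  for (int i = 0; i < 4; i++) z[i] = x[i] * k;
}
static inline void widget_mul_alt(uint64_t z[4], const uint64_t x[4], uint64_t k) {
  for (int i = 0; i < 4; i++) z[i] = x[i] * k; /* same math, different tuning */
}

/* The selector branches on the probe so callers never pick a variant. */
static inline void widget_mul_selector(uint64_t z[4], const uint64_t x[4], uint64_t k) {
  if (use_widget_alt()) { widget_mul_alt(z, x, k); }
  else { widget_mul(z, x, k); }
}

int main(void) {
  const uint64_t x[4] = {1, 2, 3, 4};
  uint64_t z[4];
  widget_mul_selector(z, x, 10); /* callers see one entry point */
  printf("%" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
         z[0], z[1], z[2], z[3]);
  return 0;
}

The selectors in s2n-bignum_aws-lc.h follow this same shape: a single capability test inside a static inline wrapper, so the per-call dispatch cost is one predictable branch and calling code never needs to know which variant exists.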